llama_cpp 0.14.3 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
data/vendor/tmp/llama.cpp/llama.cpp:

@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
@@ -52,12 +52,16 @@
         #define NOMINMAX
     #endif
     #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
     #include <io.h>
 #endif
 
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
 #include <cinttypes>
 #include <climits>
@@ -68,7 +72,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
@@ -192,6 +195,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
+    LLM_ARCH_GROK,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
@@ -214,6 +218,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_UNKNOWN,
 };
@@ -221,6 +226,7 @@ enum llm_arch {
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,           "llama"      },
     { LLM_ARCH_FALCON,          "falcon"     },
+    { LLM_ARCH_GROK,            "grok"       },
     { LLM_ARCH_GPT2,            "gpt2"       },
     { LLM_ARCH_GPTJ,            "gptj"       },
     { LLM_ARCH_GPTNEOX,         "gptneox"    },
@@ -244,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA,           "gemma"      },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
+    { LLM_ARCH_XVERSE,          "xverse"     },
     { LLM_ARCH_COMMAND_R,       "command-r"  },
     { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
@@ -254,6 +261,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_VERSION,
     LLM_KV_GENERAL_URL,
     LLM_KV_GENERAL_DESCRIPTION,
     LLM_KV_GENERAL_LICENSE,
@@ -290,6 +298,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
+    LLM_KV_SPLIT_NO,
+    LLM_KV_SPLIT_COUNT,
+    LLM_KV_SPLIT_TENSORS_COUNT,
+
     LLM_KV_SSM_INNER_SIZE,
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
@@ -306,6 +318,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_CLS_ID,
+    LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_ADD_PREFIX,
@@ -319,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
     { LLM_KV_GENERAL_NAME,                  "general.name"                          },
     { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,               "general.version"                       },
     { LLM_KV_GENERAL_URL,                   "general.url"                           },
     { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
     { LLM_KV_GENERAL_LICENSE,               "general.license"                       },
@@ -355,6 +370,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },
 
+    { LLM_KV_SPLIT_NO,                      "split.no"            },
+    { LLM_KV_SPLIT_COUNT,                   "split.count"         },
+    { LLM_KV_SPLIT_TENSORS_COUNT,           "split.tensors.count" },
+
     { LLM_KV_SSM_CONV_KERNEL,               "%s.ssm.conv_kernel"    },
     { LLM_KV_SSM_INNER_SIZE,                "%s.ssm.inner_size"     },
     { LLM_KV_SSM_STATE_SIZE,                "%s.ssm.state_size"     },
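The three `split.*` keys are the metadata side of split-GGUF support: a sharded model records its own shard number, the total shard count, and the total tensor count. As a rough illustration (a minimal sketch against the gguf C API from ggml; `is_split_model` is a hypothetical helper, not code from this gem), a loader could detect a sharded file like this:

```cpp
#include <cstdint>
#include "ggml.h" // the gguf_* API lived in ggml.h at this vintage

// Hypothetical helper: returns true when the GGUF file declares itself
// as one shard of a multi-file model via the "split.count" key.
static bool is_split_model(const struct gguf_context * ctx, uint16_t & n_split) {
    const int kid = gguf_find_key(ctx, "split.count");
    if (kid < 0) {
        return false; // key absent: a plain single-file model
    }
    n_split = gguf_get_val_u16(ctx, kid);
    return n_split > 1;
}
```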
@@ -371,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID,              "tokenizer.ggml.unknown_token_id"   },
     { LLM_KV_TOKENIZER_SEP_ID,              "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID,              "tokenizer.ggml.padding_token_id"   },
+    { LLM_KV_TOKENIZER_CLS_ID,              "tokenizer.ggml.cls_token_id"       },
+    { LLM_KV_TOKENIZER_MASK_ID,             "tokenizer.ggml.mask_token_id"      },
     { LLM_KV_TOKENIZER_ADD_BOS,             "tokenizer.ggml.add_bos_token"      },
     { LLM_KV_TOKENIZER_ADD_EOS,             "tokenizer.ggml.add_eos_token"      },
     { LLM_KV_TOKENIZER_ADD_PREFIX,          "tokenizer.ggml.add_space_prefix"   },
@@ -411,9 +432,12 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_ACT,
-    LLM_TENSOR_FFN_DOWN_EXP,
+    LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
+    LLM_TENSOR_FFN_GATE_EXPS,
+    LLM_TENSOR_FFN_UP_EXPS,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -448,6 +472,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
             { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
             { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
         },
     },
     {
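The `*_EXPS` names mark the switch from one 2-D tensor per expert to a single 3-D tensor holding all experts of a layer; the old `*_EXP` names stay registered so models converted before the change keep loading. A minimal sketch of the shape difference (hypothetical dimension names, plain ggml calls):

```cpp
#include "ggml.h"

// Sketch only: contrast the two expert-weight layouts for one FFN.
void expert_layouts(struct ggml_context * ctx,
                    int64_t n_embd, int64_t n_ff, int64_t n_expert) {
    // old layout: n_expert separate 2-D tensors ("blk.%d.ffn_up.%d")
    for (int64_t i = 0; i < n_expert; ++i) {
        ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, n_ff);
    }
    // new layout: one 3-D tensor for the whole layer ("blk.%d.ffn_up_exps")
    ggml_new_tensor_3d(ctx, GGML_TYPE_F16, n_embd, n_ff, n_expert);
}
```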
@@ -483,6 +510,31 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GROK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+        },
+    },
     {
         LLM_ARCH_GPT2,
         {
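Registering an architecture such as Grok touches three coordinated tables: the `llm_arch` enum value, its string in `LLM_ARCH_NAMES` (matched against the `general.architecture` GGUF field), and the per-architecture tensor-name map above. A sketch of the reverse lookup, simplified from llama.cpp's `llm_arch_from_string` and reusing the declarations in this diff:

```cpp
#include <string>

// Sketch: resolve the architecture string read from GGUF metadata back
// to the enum, falling back to LLM_ARCH_UNKNOWN for unknown names.
static llm_arch arch_from_name(const std::string & name) {
    for (const auto & kv : LLM_ARCH_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return LLM_ARCH_UNKNOWN;
}
```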
@@ -548,6 +600,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_ACT,         "blk.%d.ffn.act" },
+            { LLM_TENSOR_POS_EMBD,        "position_embd" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm"},
         },
     },
     {
@@ -843,6 +898,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_XVERSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_COMMAND_R,
         {
@@ -856,6 +930,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
     {
@@ -1030,7 +1106,7 @@ struct llama_file {
     size_t size;
 
     llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
+        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
             throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
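`ggml_fopen` replaces the plain `fopen` call so model paths survive on Windows, where `fopen` expects the ANSI code page rather than UTF-8. A sketch of the idea behind it (not the actual ggml implementation; `fopen_utf8` is a hypothetical stand-in):

```cpp
#include <cstdio>
#ifdef _WIN32
#include <windows.h>
#endif

// Sketch: convert a UTF-8 path to UTF-16 on Windows and open with _wfopen;
// on other platforms, plain fopen already accepts UTF-8 bytes.
FILE * fopen_utf8(const char * fname, const char * mode) {
#ifdef _WIN32
    wchar_t wfname[MAX_PATH];
    wchar_t wmode[16];
    MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname, MAX_PATH);
    MultiByteToWideChar(CP_UTF8, 0, mode,  -1, wmode,  16);
    return _wfopen(wfname, wmode);
#else
    return std::fopen(fname, mode);
#endif
}
```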
@@ -1099,6 +1175,7 @@ struct llama_file {
         }
     }
 };
+using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
 struct llama_mmap {
     void * addr;
@@ -1299,6 +1376,7 @@ struct llama_mmap {
     }
 #endif
 };
+using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 
 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
@@ -1448,6 +1526,7 @@ struct llama_mlock {
     static void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
@@ -1467,7 +1546,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     // host buffers should only be used when data is expected to be copied to/from the GPU
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
@@ -1497,7 +1576,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 
 #ifdef GGML_USE_METAL
     buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(gpu);
@@ -1523,7 +1602,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     if (ggml_backend_cuda_get_device_count() > 1) {
         buft = ggml_backend_cuda_split_buffer_type(tensor_split);
     }
@@ -1544,7 +1623,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 
 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
     return ggml_backend_sycl_get_device_count();
@@ -1556,20 +1635,20 @@ static size_t llama_get_device_count() {
 }
 
 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &total, &free);
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &total, &free);
+    ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;
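The backends take the output pointers in the order `free` first, then `total`; the old call sites had them swapped, so `llama_get_device_memory` reported total device memory instead of free memory. A small usage sketch with the corrected order (assumes a CUDA build; `print_vram` is illustrative only):

```cpp
#include <cstddef>
#include <cstdio>
#include "ggml-cuda.h"

// Sketch: report per-device free/total memory with the corrected
// argument order (free first, then total).
void print_vram() {
    const int n_dev = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_dev; ++i) {
        size_t free  = 0;
        size_t total = 0;
        ggml_backend_cuda_get_device_memory(i, &free, &total);
        printf("device %d: %zu of %zu bytes free\n", i, free, total);
    }
}
```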
@@ -1621,10 +1700,13 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_314B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_8x7B,
+    MODEL_8x22B,
 };
 
 static const size_t kiB = 1024;
@@ -1738,6 +1820,7 @@ struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
+    uint32_t n_seq_max;
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
@@ -1803,9 +1886,9 @@ struct llama_layer {
 
     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_up_exp  [LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_gate_exps;
+    struct ggml_tensor * ffn_down_exps;
+    struct ggml_tensor * ffn_up_exps ;
 
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
@@ -1941,11 +2024,13 @@ struct llama_vocab {
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
     // default LLaMA special tokens
-    id special_bos_id = 1;
-    id special_eos_id = 2;
-    id special_unk_id = 0;
-    id special_sep_id = -1;
-    id special_pad_id = -1;
+    id special_bos_id  = 1;
+    id special_eos_id  = 2;
+    id special_unk_id  = 0;
+    id special_sep_id  = -1;
+    id special_pad_id  = -1;
+    id special_cls_id  = -1;
+    id special_mask_id = -1;
 
     int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
     int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
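The CLS and MASK slots serve BERT-style encoder models; -1 marks a token as absent, so consumers guard on that sentinel before emitting it. A minimal sketch (hypothetical helper over the struct above):

```cpp
// Sketch: the new optional special tokens default to -1 (absent),
// so callers check the sentinel before using them.
static bool vocab_has_cls(const llama_vocab & vocab) {
    return vocab.special_cls_id != -1;
}
```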
@@ -2023,12 +2108,12 @@ struct llama_model {
     // the model memory buffers for the tensor data
     std::vector<ggml_backend_buffer_t> bufs;
 
-    // model memory mapped file
-    std::unique_ptr<llama_mmap> mapping;
+    // model memory mapped files
+    llama_mmaps mappings;
 
     // objects representing data potentially being locked in memory
-    llama_mlock mlock_buf;
-    llama_mlock mlock_mmap;
+    llama_mlocks mlock_bufs;
+    llama_mlocks mlock_mmaps;
 
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2041,7 +2126,7 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
                 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
             }
@@ -2060,10 +2145,6 @@ struct llama_context {
             ggml_backend_free(backend);
         }
 
-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
         ggml_backend_buffer_free(buf_output);
     }
 
@@ -2100,20 +2181,20 @@ struct llama_context {
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_t buf_output = nullptr;
 
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    size_t  logits_size = 0;
-    float * logits      = nullptr;
+    // decode output (2-dimensional array: [n_outputs][n_vocab])
+    size_t  logits_size = 0; // capacity (of floats) for logits
+    float * logits      = nullptr;
+
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
+    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
 
-#ifndef NDEBUG
-    // guard against access to unset logits
-    std::vector<bool>  logits_valid;
-#endif
     bool logits_all = false;
 
-    // embeddings output (2-dimensional array: [n_tokens][n_embd])
+    // embeddings output (2-dimensional array: [n_outputs][n_embd])
     // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
-    size_t  embd_size = 0;
-    float * embd      = nullptr;
+    size_t  embd_size = 0; // capacity (of floats) for embeddings
+    float * embd      = nullptr;
 
     // sequence embeddings output (map of [n_embd] vectors)
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
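This is the heart of the reduced output-buffer scheme: instead of reserving a logits row for every token in the batch, rows exist only for tokens that requested output, and `output_ids` maps a batch position to its row (or a negative value when none was requested). A sketch of the lookup a logits accessor would perform (field names as in the struct above; hypothetical free function):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: resolve batch position i_batch to its row in the packed logits
// buffer; returns nullptr when that token produced no output.
float * logits_for_position(const std::vector<int32_t> & output_ids,
                            float * logits, int32_t n_vocab, int32_t i_batch) {
    const int32_t row = output_ids[i_batch];
    if (row < 0) {
        return nullptr;
    }
    return logits + (size_t) row * n_vocab;
}
```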
@@ -2130,14 +2211,15 @@ struct llama_context {
     struct ggml_tensor * inp_tokens;    // I32 [n_batch]
     struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;       // I32 [n_batch]
+    struct ggml_tensor * inp_out_ids;   // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask;   // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos;    // F32 [kv_size]
+    struct ggml_tensor * inp_KQ_pos;    // F32 [n_kv]
     struct ggml_tensor * inp_K_shift;   // I32 [kv_size]
     struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;       // I32 [n_batch]
     struct ggml_tensor * inp_s_copy;    // I32 [kv_size]
-    struct ggml_tensor * inp_s_mask;    // F32 [1, kv_size]
-    struct ggml_tensor * inp_s_seq;     // I32 [kv_size, n_batch]
+    struct ggml_tensor * inp_s_mask;    // F32 [1, n_kv]
+    struct ggml_tensor * inp_s_seq;     // I32 [n_kv, n_batch]
 
     // control vectors
     struct llama_control_vector cvec;
@@ -2792,6 +2874,8 @@ namespace GGUFMeta {
     };
 }
 
+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -2802,54 +2886,133 @@ struct llama_model_loader {
 
     bool use_mmap = false;
 
-    llama_file  file;
+    llama_files files;
     llama_ftype ftype;
     llama_fver  fver;
 
-    std::unique_ptr<llama_mmap> mapping;
+    llama_mmaps mappings;
+
+    // Holds information on a model weight
+    struct llama_tensor_weight {
+        uint16_t  idx; // source file index
+        size_t   offs; // tensor data offset in the original file
+
+        ggml_tensor * tensor;
+
+        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+        }
+    };
+    std::vector<llama_tensor_weight> weights;
+
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-    struct gguf_context * ctx_gguf = NULL;
-    struct ggml_context * ctx_meta = NULL;
+    struct gguf_context * meta = NULL;
+    std::vector<ggml_context *> contexts;
 
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
         }
 
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
         if (param_overrides_p != nullptr) {
             for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
 
-        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-        if (!ctx_gguf) {
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        meta = gguf_init_from_file(fname.c_str(), params);
+        if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-        n_kv      = gguf_get_n_kv(ctx_gguf);
-        n_tensors = gguf_get_n_tensors(ctx_gguf);
+        // Save tensors data offset of the main file.
+        // For subsidiary files, `meta` tensor data offset must not be used,
+        // so we build a unified tensors index for weights.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            weights.emplace_back(0, cur->name, meta, cur);
+        }
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
+        uint16_t n_split = 0;
+        get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+        // Load additional GGML contexts
+        if (n_split > 1) {
+            uint16_t idx = 0;
+            get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
+            if (idx != 0) {
+                throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
+            }
+
+            char split_prefix[PATH_MAX] = {0};
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
+                throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
+            }
+
+            if (trace > 0) {
+                LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+            }
+
+            char split_path[PATH_MAX] = {0};
+            for (idx = 1; idx < n_split; idx++) {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+                struct gguf_init_params split_params = {
+                    /*.no_alloc = */ true,
+                    /*.ctx      = */ &ctx,
+                };
+                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                if (!ctx_gguf) {
+                    throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+                }
+
+                // Save tensors data offset info of the shard.
+                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                    weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+                }
+                files.emplace_back(new llama_file(split_path, "rb"));
+                contexts.emplace_back(ctx);
+
+                gguf_free(ctx_gguf);
+            }
+
+            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
-        fver = (enum llama_fver) gguf_get_version(ctx_gguf);
+            // sanity check
+            {
+                const int n_tensors_loaded = (int) weights.size();
+                if (n_tensors != n_tensors_loaded) {
+                    throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+                }
+            }
+
+            LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
+        }
+
+        n_kv      = gguf_get_n_kv(meta);
+        n_tensors = weights.size();
 
-        for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            n_elements += ggml_nelements(t);
-            n_bytes    += ggml_nbytes(t);
+        fver = (enum llama_fver) gguf_get_version(meta);
+
+        for (auto & w : weights) {
+            n_elements += ggml_nelements(w.tensor);
+            n_bytes    += ggml_nbytes(w.tensor);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
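Note: the shard discovery above leans on llama.cpp's split naming convention, `<prefix>-%05d-of-%05d.gguf` with a 1-based shard number. A hedged re-implementation of the two path helpers used here (the canonical versions are `llama_split_path()` and `llama_split_prefix()` from llama.h; this sketch only mirrors their observable behavior):

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    // build "model-00002-of-00004.gguf" from prefix "model", split_no 1, split_count 4
    static int split_path(char * out, size_t maxlen, const char * prefix, int split_no, int split_count) {
        return snprintf(out, maxlen, "%s-%05d-of-%05d.gguf", prefix, split_no + 1, split_count);
    }

    // recover "model" from "model-00001-of-00004.gguf"; returns 0 if the suffix does not match
    static int split_prefix(char * out, size_t maxlen, const char * path, int split_no, int split_count) {
        char suffix[64];
        snprintf(suffix, sizeof(suffix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
        const size_t path_len = strlen(path);
        const size_t suf_len  = strlen(suffix);
        if (path_len <= suf_len || strcmp(path + path_len - suf_len, suffix) != 0) {
            return 0;
        }
        const size_t prefix_len = path_len - suf_len;
        if (prefix_len >= maxlen) {
            return 0;
        }
        memcpy(out, path, prefix_len);
        out[prefix_len] = '\0';
        return (int) prefix_len;
    }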
@@ -2864,7 +3027,8 @@ struct llama_model_loader {
             enum ggml_type type_max = GGML_TYPE_F32;
 
             for (int i = 0; i < n_tensors; i++) {
-                enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+                const ggml_tensor * tensor = weights.at(i).tensor;
+                enum ggml_type type = tensor->type;
 
                 n_type[type]++;
@@ -2874,8 +3038,8 @@ struct llama_model_loader {
                 }
 
                 if (trace > 0) {
-                    struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                    LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+                    const uint16_t sid = weights.at(i).idx;
+                    LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                 }
             }
@@ -2897,6 +3061,7 @@ struct llama_model_loader {
                 case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
                 case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                 case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
+                case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
                 case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
                 case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
                 case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
@@ -2911,22 +3076,23 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
             {
-                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+                const int kid = gguf_find_key(meta, "general.file_type");
                 if (kid >= 0) {
-                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                    ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
                 }
             }
 
             LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+
             for (int i = 0; i < n_kv; i++) {
-                const char * name           = gguf_get_key(ctx_gguf, i);
-                const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+                const char * name           = gguf_get_key(meta, i);
+                const enum gguf_type type   = gguf_get_kv_type(meta, i);
                 const std::string type_name =
                     type == GGUF_TYPE_ARRAY
-                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
                     : gguf_type_name(type);
 
-                std::string value          = gguf_kv_to_str(ctx_gguf, i);
+                std::string value          = gguf_kv_to_str(meta, i);
                 const size_t MAX_VALUE_LEN = 40;
                 if (value.size() > MAX_VALUE_LEN) {
                     value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
     | 
|
| 
       2955 
3121 
     | 
    
         
             
                }
         
     | 
| 
       2956 
3122 
     | 
    
         | 
| 
       2957 
3123 
     | 
    
         
             
                ~llama_model_loader() {
         
     | 
| 
       2958 
     | 
    
         
            -
                    if ( 
     | 
| 
       2959 
     | 
    
         
            -
                        gguf_free( 
     | 
| 
      
 3124 
     | 
    
         
            +
                    if (meta) {
         
     | 
| 
      
 3125 
     | 
    
         
            +
                        gguf_free(meta);
         
     | 
| 
       2960 
3126 
     | 
    
         
             
                    }
         
     | 
| 
       2961 
     | 
    
         
            -
                     
     | 
| 
       2962 
     | 
    
         
            -
                        ggml_free( 
     | 
| 
      
 3127 
     | 
    
         
            +
                    for (auto * ctx : contexts) {
         
     | 
| 
      
 3128 
     | 
    
         
            +
                        ggml_free(ctx);
         
     | 
| 
       2963 
3129 
     | 
    
         
             
                    }
         
     | 
| 
       2964 
3130 
     | 
    
         
             
                }
         
     | 
| 
       2965 
3131 
     | 
    
         | 
| 
       2966 
3132 
     | 
    
         
             
                template<typename T>
         
     | 
| 
       2967 
3133 
     | 
    
         
             
                typename std::enable_if<std::is_integral<T>::value, bool>::type
         
     | 
| 
       2968 
3134 
     | 
    
         
             
                get_arr_n(const std::string & key, T & result, const bool required = true) {
         
     | 
| 
       2969 
     | 
    
         
            -
                    const int kid = gguf_find_key( 
     | 
| 
      
 3135 
     | 
    
         
            +
                    const int kid = gguf_find_key(meta, key.c_str());
         
     | 
| 
       2970 
3136 
     | 
    
         | 
| 
       2971 
3137 
     | 
    
         
             
                    if (kid < 0) {
         
     | 
| 
       2972 
3138 
     | 
    
         
             
                        if (required) {
         
     | 
| 
         @@ -2976,7 +3142,7 @@ struct llama_model_loader { 
     | 
|
| 
       2976 
3142 
     | 
    
         
             
                    }
         
     | 
| 
       2977 
3143 
     | 
    
         | 
| 
       2978 
3144 
     | 
    
         
             
                    struct GGUFMeta::ArrayInfo arr_info =
         
     | 
| 
       2979 
     | 
    
         
            -
                        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv( 
     | 
| 
      
 3145 
     | 
    
         
            +
                        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
         
     | 
| 
       2980 
3146 
     | 
    
         | 
| 
       2981 
3147 
     | 
    
         | 
| 
       2982 
3148 
     | 
    
         
             
                    result = arr_info.length;
         
     | 
| 
         @@ -2996,7 +3162,7 @@ struct llama_model_loader { 
     | 
|
| 
       2996 
3162 
     | 
    
         
             
                    const struct llama_model_kv_override * override =
         
     | 
| 
       2997 
3163 
     | 
    
         
             
                        it != kv_overrides.end() ? &it->second : nullptr;
         
     | 
| 
       2998 
3164 
     | 
    
         | 
| 
       2999 
     | 
    
         
            -
                    const bool found = GGUFMeta::GKV<T>::set( 
     | 
| 
      
 3165 
     | 
    
         
            +
                    const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
         
     | 
| 
       3000 
3166 
     | 
    
         | 
| 
       3001 
3167 
     | 
    
         
             
                    if (required && !found) {
         
     | 
| 
       3002 
3168 
     | 
    
         
             
                        throw std::runtime_error(format("key not found in model: %s", key.c_str()));
         
     | 
| 
         @@ -3019,28 +3185,57 @@ struct llama_model_loader { 
     | 
|
| 
       3019 
3185 
     | 
    
         
             
                }
         
     | 
| 
       3020 
3186 
     | 
    
         | 
| 
       3021 
3187 
     | 
    
         
             
                const char * get_tensor_name(int i) const {
         
     | 
| 
       3022 
     | 
    
         
            -
                    return  
     | 
| 
      
 3188 
     | 
    
         
            +
                    return weights.at(i).tensor->name;
         
     | 
| 
      
 3189 
     | 
    
         
            +
                }
         
     | 
| 
      
 3190 
     | 
    
         
            +
             
     | 
| 
      
 3191 
     | 
    
         
            +
                const llama_tensor_weight * get_weight(const char * name) const {
         
     | 
| 
      
 3192 
     | 
    
         
            +
                    for (const auto & weight : weights) {
         
     | 
| 
      
 3193 
     | 
    
         
            +
                        if (strcmp(name, weight.tensor->name) == 0) {
         
     | 
| 
      
 3194 
     | 
    
         
            +
                            return &weight;
         
     | 
| 
      
 3195 
     | 
    
         
            +
                        }
         
     | 
| 
      
 3196 
     | 
    
         
            +
                    }
         
     | 
| 
      
 3197 
     | 
    
         
            +
                    return nullptr;
         
     | 
| 
      
 3198 
     | 
    
         
            +
                }
         
     | 
| 
      
 3199 
     | 
    
         
            +
             
     | 
| 
      
 3200 
     | 
    
         
            +
                const llama_tensor_weight & require_weight(const char * name) const {
         
     | 
| 
      
 3201 
     | 
    
         
            +
                    const llama_tensor_weight * weight = get_weight(name);
         
     | 
| 
      
 3202 
     | 
    
         
            +
                    if (!weight) {
         
     | 
| 
      
 3203 
     | 
    
         
            +
                        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
         
     | 
| 
      
 3204 
     | 
    
         
            +
                    }
         
     | 
| 
      
 3205 
     | 
    
         
            +
                    return *weight;
         
     | 
| 
       3023 
3206 
     | 
    
         
             
                }
         
     | 
| 
       3024 
3207 
     | 
    
         | 
| 
       3025 
3208 
     | 
    
         
             
                struct ggml_tensor * get_tensor_meta(const char * name) const {
         
     | 
| 
       3026 
     | 
    
         
            -
                     
     | 
| 
      
 3209 
     | 
    
         
            +
                    const auto * weight = get_weight(name);
         
     | 
| 
      
 3210 
     | 
    
         
            +
                    if (!weight) {
         
     | 
| 
      
 3211 
     | 
    
         
            +
                        return nullptr;
         
     | 
| 
      
 3212 
     | 
    
         
            +
                    }
         
     | 
| 
      
 3213 
     | 
    
         
            +
                    return weight->tensor;
         
     | 
| 
      
 3214 
     | 
    
         
            +
                }
         
     | 
| 
      
 3215 
     | 
    
         
            +
             
     | 
| 
      
 3216 
     | 
    
         
            +
                struct ggml_tensor * require_tensor_meta(const char * name) const {
         
     | 
| 
      
 3217 
     | 
    
         
            +
                    struct ggml_tensor * tensor = get_tensor_meta(name);
         
     | 
| 
      
 3218 
     | 
    
         
            +
                    if (!tensor) {
         
     | 
| 
      
 3219 
     | 
    
         
            +
                        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
         
     | 
| 
      
 3220 
     | 
    
         
            +
                    }
         
     | 
| 
      
 3221 
     | 
    
         
            +
                    return tensor;
         
     | 
| 
       3027 
3222 
     | 
    
         
             
                }
         
     | 
| 
       3028 
3223 
     | 
    
         | 
| 
       3029 
3224 
     | 
    
         
             
                struct ggml_tensor * get_tensor_meta(int i) const {
         
     | 
| 
       3030 
3225 
     | 
    
         
             
                    return get_tensor_meta(get_tensor_name(i));
         
     | 
| 
       3031 
3226 
     | 
    
         
             
                }
         
     | 
| 
       3032 
3227 
     | 
    
         | 
| 
       3033 
     | 
    
         
            -
                struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor *  
     | 
| 
       3034 
     | 
    
         
            -
                    struct ggml_tensor * tensor = ggml_dup_tensor(ctx,  
     | 
| 
       3035 
     | 
    
         
            -
                    ggml_set_name(tensor, ggml_get_name( 
     | 
| 
      
 3228 
     | 
    
         
            +
                struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
         
     | 
| 
      
 3229 
     | 
    
         
            +
                    struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         
     | 
| 
      
 3230 
     | 
    
         
            +
                    ggml_set_name(tensor, ggml_get_name(cur));
         
     | 
| 
       3036 
3231 
     | 
    
         | 
| 
       3037 
3232 
     | 
    
         
             
                    n_created++;
         
     | 
| 
       3038 
3233 
     | 
    
         | 
| 
       3039 
3234 
     | 
    
         
             
                    return tensor;
         
     | 
| 
       3040 
3235 
     | 
    
         
             
                }
         
     | 
| 
       3041 
3236 
     | 
    
         | 
| 
       3042 
     | 
    
         
            -
                struct ggml_tensor *  
     | 
| 
       3043 
     | 
    
         
            -
                    struct ggml_tensor * cur =  
     | 
| 
      
 3237 
     | 
    
         
            +
                const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
         
     | 
| 
      
 3238 
     | 
    
         
            +
                    const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
         
     | 
| 
       3044 
3239 
     | 
    
         | 
| 
       3045 
3240 
     | 
    
         
             
                    if (cur == NULL) {
         
     | 
| 
       3046 
3241 
     | 
    
         
             
                        if (!required) {
         
     | 
| 
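Note: `get_weight` above resolves a tensor by name with a linear scan over the unified index, trading lookup speed for a simple, allocation-free structure. A self-contained sketch of the same name-to-(file, offset) index idea, with simplified types rather than the actual llama.cpp structs:

    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // simplified stand-in for llama_tensor_weight: which source file a tensor
    // lives in, and where its data starts within that file
    struct tensor_weight {
        uint16_t    idx;  // source file index (0 = main split)
        size_t      offs; // absolute data offset within that file
        std::string name;
    };

    struct weight_index {
        std::vector<tensor_weight> weights;

        // linear scan, mirroring llama_model_loader::get_weight
        const tensor_weight * get(const char * name) const {
            for (const auto & w : weights) {
                if (strcmp(name, w.name.c_str()) == 0) {
                    return &w;
                }
            }
            return nullptr;
        }

        // throwing variant, mirroring require_weight
        const tensor_weight & require(const char * name) const {
            const tensor_weight * w = get(name);
            if (!w) {
                throw std::runtime_error(std::string("tensor '") + name + "' not found");
            }
            return *w;
        }
    };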
@@ -3051,8 +3246,8 @@ struct llama_model_loader {
 
         {
             bool is_ok = true;
-            for (size_t i = 0; i < ne.size(); ++i) {
-                if (ne[i] != cur->ne[i]) {
+            for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+                if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
                     is_ok = false;
                     break;
                 }
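Note: the reworked check accepts a request with fewer dimensions than GGML_MAX_DIMS as long as every unspecified trailing dimension of the stored tensor is 1, e.g. { 4096 } matches a stored 4096 x 1 x 1 x 1 tensor, whereas the old loop only compared the first ne.size() dimensions. A standalone rendering of the new predicate (GGML_MAX_DIMS is 4 in ggml; the function name is illustrative):

    #include <cstdint>
    #include <vector>

    enum { MAX_DIMS = 4 }; // ggml's GGML_MAX_DIMS

    // true when the stored shape ne_cur matches the requested shape ne,
    // treating requested dimensions beyond ne.size() as 1
    static bool dims_match(const std::vector<int64_t> & ne, const int64_t (&ne_cur)[MAX_DIMS]) {
        for (size_t i = 0; i < MAX_DIMS; ++i) {
            if ((i < ne.size() && ne[i] != ne_cur[i]) || (i >= ne.size() && ne_cur[i] != 1)) {
                return false;
            }
        }
        return true;
    }

    // dims_match({4096},     {4096, 1, 1, 1})  == true
    // dims_match({4096, 32}, {4096, 32, 1, 1}) == true
    // dims_match({4096},     {4096, 32, 1, 1}) == false (trailing dim != 1)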
@@ -3066,127 +3261,196 @@ struct llama_model_loader {
             }
         }
 
-        return create_tensor_for(ctx, cur);
+        return cur;
     }
 
-    void done_getting_tensors() const {
-        if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+
+        if (cur == NULL) {
+            return NULL;
         }
+
+        return create_tensor_for(ctx, cur);
     }
 
-    size_t file_offset(const char * name) const {
-        const int idx = gguf_find_tensor(ctx_gguf, name);
+    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
 
-        if (idx < 0) {
-            throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+        if (cur == NULL) {
+            return NULL;
         }
 
-        return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
-    }
+        if (cur->type != base->type) {
+            throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
+        }
 
-    void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
-        // prefetch the whole file - all the data is needed anyway
-        if (use_mmap) {
-            mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+        std::array<int64_t, GGML_MAX_DIMS> dims;
+        for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
+            dims[i] = i < ne.size() ? ne[i] : 1;
         }
 
-        // compute the total size of all tensors for progress reporting
-        for (int i = 0; i < n_tensors; i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            size_data += ggml_nbytes(cur);
+        struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
+                                        dims[0], dims[1], dims[2], dims[3],
+                                        cur->nb[1], cur->nb[2], cur->nb[3],
+                                        offset);
+
+        ggml_set_name(tensor, name.c_str());
+
+        n_created++;
+
+        return tensor;
+    }
+
+    void done_getting_tensors() const {
+        if (n_created != n_tensors) {
+            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
+    }
 
-        if (use_mmap && mapping) {
-            if (lmlock) {
-                lmlock->init(mapping->addr);
+    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
+        if (use_mmap) {
+            mappings.reserve(files.size());
+            mmaps_used.reserve(files.size());
+            for (const auto & file : files) {
+                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+                mmaps_used.emplace_back(mapping->size, 0);
+                if (mlock_mmaps) {
+                    std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
+                    mlock_mmap->init(mapping->addr);
+                    mlock_mmaps->emplace_back(std::move(mlock_mmap));
+                }
+                mappings.emplace_back(std::move(mapping));
             }
-            mmap_used_first = mapping->size;
+        }
+
+        // compute the total size of all tensors for progress reporting
+        for (auto & w : weights) {
+            size_data += ggml_nbytes(w.tensor);
         }
     }
 
-    void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
-        GGML_ASSERT(mapping);
+    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+        GGML_ASSERT(!mappings.empty());
+        const auto & mapping = mappings.at(idx);
 
         *first = mapping->size;
         *last  = 0;
+        *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            const size_t offs = file_offset(ggml_get_name(tensor));
-            *first = std::min(*first, offs);
-            *last  = std::max(*last,  offs + ggml_nbytes(tensor));
+            try {
+                const auto * weight = get_weight(ggml_get_name(tensor));
+                if (!weight) {
+                    continue;
+                }
+                if (weight->idx != idx) {
+                    continue;
+                }
+                *first = std::min(*first, weight->offs);
+                *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
+            } catch(...) {
+                // the tensor is not in the model
+            }
         }
     }
 
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
-        const size_t offs = file_offset(ggml_get_name(cur));
+        const auto & w = require_weight(ggml_get_name(cur));
 
-        if (use_mmap && mapping) {
+        if (use_mmap) {
+            const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + offs;
+                cur->data = (uint8_t *)mapping->addr + w.offs;
             } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+                memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
             }
         } else {
             GGML_ASSERT(cur->data != nullptr);
-            file.seek(offs, SEEK_SET);
-            file.read_raw(cur->data, ggml_nbytes(cur));
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
+            file->seek(w.offs, SEEK_SET);
+            file->read_raw(cur->data, ggml_nbytes(cur));
         }
     }
 
     size_t size_done = 0;
     size_t size_data = 0;
-    size_t mmap_used_first = -1;
-    size_t mmap_used_last  = 0;
+    std::vector<std::pair<size_t, size_t>> mmaps_used;
 
     // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
-        GGML_ASSERT(size_data != 0 && "call init_mapping() first");
+    bool load_all_data(
+            struct ggml_context * ctx,
+            llama_buf_map & bufs_mmap,
+            llama_mlocks * lmlocks,
+            llama_progress_callback progress_callback,
+            void * progress_callback_user_data) {
+        GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
-
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            const auto * weight = get_weight(ggml_get_name(cur));
+            if (weight == nullptr) {
+                // this can happen with split experts models
+                continue;
+            }
+
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
                 }
            }
 
-            const size_t offs = file_offset(ggml_get_name(cur));
+            size_t n_size = ggml_nbytes(cur);
 
-            if (use_mmap && buf_mmap) {
+            if (use_mmap) {
+                const auto & mapping = mappings.at(weight->idx);
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                if (bufs_mmap.count(weight->idx)) {
+                    buf_mmap = bufs_mmap.at(weight->idx);
+                }
+                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
-                    if (lmlock) {
-                        lmlock->grow_to(offs + ggml_nbytes(cur));
+                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+                    if (lmlocks) {
+                        const auto & lmlock = lmlocks->at(weight->idx);
+                        lmlock->grow_to(weight->offs + ggml_nbytes(cur));
                     }
-
-                    mmap_used_first = std::min(mmap_used_first, offs);
+
+                    auto & mmap_used = mmaps_used[weight->idx];
+                    mmap_used.first  = std::min(mmap_used.first,  weight->offs);
| 
      
 3422 
     | 
    
         
            +
                                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
         
     | 
| 
       3165 
3423 
     | 
    
         
             
                            } else {
         
     | 
| 
       3166 
     | 
    
         
            -
                                ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0,  
     | 
| 
      
 3424 
     | 
    
         
            +
                                ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
         
     | 
| 
       3167 
3425 
     | 
    
         
             
                            }
         
     | 
| 
       3168 
3426 
     | 
    
         
             
                        } else {
         
     | 
| 
      
 3427 
     | 
    
         
            +
                            GGML_ASSERT(weight->idx < files.size());
         
     | 
| 
      
 3428 
     | 
    
         
            +
                            const auto & file = files.at(weight->idx);
         
     | 
| 
       3169 
3429 
     | 
    
         
             
                            if (ggml_backend_buffer_is_host(cur->buffer)) {
         
     | 
| 
       3170 
     | 
    
         
            -
                                file 
     | 
| 
       3171 
     | 
    
         
            -
                                file 
     | 
| 
      
 3430 
     | 
    
         
            +
                                file->seek(weight->offs, SEEK_SET);
         
     | 
| 
      
 3431 
     | 
    
         
            +
                                file->read_raw(cur->data, ggml_nbytes(cur));
         
     | 
| 
       3172 
3432 
     | 
    
         
             
                            } else {
         
     | 
| 
       3173 
3433 
     | 
    
         
             
                                read_buf.resize(ggml_nbytes(cur));
         
     | 
| 
       3174 
     | 
    
         
            -
                                file 
     | 
| 
       3175 
     | 
    
         
            -
                                file 
     | 
| 
       3176 
     | 
    
         
            -
                                ggml_backend_tensor_set(cur, read_buf.data(), 0,  
     | 
| 
      
 3434 
     | 
    
         
            +
                                file->seek(weight->offs, SEEK_SET);
         
     | 
| 
      
 3435 
     | 
    
         
            +
                                file->read_raw(read_buf.data(), ggml_nbytes(cur));
         
     | 
| 
      
 3436 
     | 
    
         
            +
                                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
         
     | 
| 
       3177 
3437 
     | 
    
         
             
                            }
         
     | 
| 
       3178 
3438 
     | 
    
         
             
                        }
         
     | 
| 
       3179 
3439 
     | 
    
         | 
| 
       3180 
     | 
    
         
            -
                        size_done +=  
     | 
| 
      
 3440 
     | 
    
         
            +
                        size_done += n_size;
         
     | 
| 
       3181 
3441 
     | 
    
         
             
                    }
         
     | 
| 
       3182 
3442 
     | 
    
         | 
| 
       3183 
3443 
     | 
    
         
             
                    // check if this is the last call and do final cleanup
         
     | 
| 
       3184 
3444 
     | 
    
         
             
                    if (size_done >= size_data) {
         
     | 
| 
       3185 
3445 
     | 
    
         
             
                        // unmap offloaded tensors and metadata
         
     | 
| 
       3186 
     | 
    
         
            -
                        if (use_mmap 
     | 
| 
       3187 
     | 
    
         
            -
                             
     | 
| 
       3188 
     | 
    
         
            -
             
     | 
| 
       3189 
     | 
    
         
            -
                                mapping 
     | 
| 
      
 3446 
     | 
    
         
            +
                        if (use_mmap) {
         
     | 
| 
      
 3447 
     | 
    
         
            +
                            for (uint32_t idx = 0; idx < mappings.size(); idx++) {
         
     | 
| 
      
 3448 
     | 
    
         
            +
                                const auto & mmap_used = mmaps_used.at(idx);
         
     | 
| 
      
 3449 
     | 
    
         
            +
                                auto & mapping = mappings.at(idx);
         
     | 
| 
      
 3450 
     | 
    
         
            +
                                mapping->unmap_fragment(0, mmap_used.first);
         
     | 
| 
      
 3451 
     | 
    
         
            +
                                if (mmap_used.second != 0) {
         
     | 
| 
      
 3452 
     | 
    
         
            +
                                    mapping->unmap_fragment(mmap_used.second, mapping->size);
         
     | 
| 
      
 3453 
     | 
    
         
            +
                                }
         
     | 
| 
       3190 
3454 
     | 
    
         
             
                            }
         
     | 
| 
       3191 
3455 
     | 
    
         
             
                        }
         
     | 
| 
       3192 
3456 
     | 
    
         
             
                        if (progress_callback) {
         
     | 
| 
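The key change in this hunk is that the loader now keeps one [first, last) byte range per split file (mmaps_used) instead of a single global pair, so after the last tensor is loaded it can unmap the unused head and tail of every mapping. A minimal standalone sketch of that bookkeeping; the SIZE_MAX sentinel and the tensor placements are hypothetical stand-ins for what llama_model_loader derives from the real mappings:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // one byte range per split file, mirroring `mmaps_used` in the diff;
    // .first shrinks toward the lowest used offset, .second grows to the highest used end
    std::vector<std::pair<size_t, size_t>> mmaps_used(2, {SIZE_MAX, 0});

    // hypothetical tensor placements: {file index, offset, size in bytes}
    struct placement { size_t idx, offs, size; };
    const placement tensors[] = { {0, 4096, 1024}, {0, 8192, 2048}, {1, 512, 256} };

    for (const auto & t : tensors) {
        auto & mmap_used = mmaps_used[t.idx];
        mmap_used.first  = std::min(mmap_used.first,  t.offs);          // same min update as the diff
        mmap_used.second = std::max(mmap_used.second, t.offs + t.size); // same max update as the diff
    }

    // after loading, bytes [0, first) and [second, file size) of each mapping are
    // never read again; unmap_fragment() above releases exactly those ranges
    for (size_t idx = 0; idx < mmaps_used.size(); idx++) {
        std::printf("file %zu: used bytes [%zu, %zu)\n", idx, mmaps_used[idx].first, mmaps_used[idx].second);
    }
    return 0;
}

The sentinel choice here is an assumption for the sketch; the point is only the per-file min/max update that replaces the old mmap_used_first/mmap_used_last globals.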
@@ -3259,6 +3523,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M  :return "IQ1_M - 1.75 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
@@ -3290,10 +3555,13 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_40B:    return "40B";
         case MODEL_65B:    return "65B";
         case MODEL_70B:    return "70B";
+        case MODEL_314B:   return "314B";
         case MODEL_SMALL:  return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE:  return "0.8B";
         case MODEL_XL:     return "1.5B";
+        case MODEL_8x7B:   return "8x7B";
+        case MODEL_8x22B:  return "8x22B";
         default:           return "?B";
     }
 }
@@ -3319,7 +3587,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.ctx_gguf;
+    const gguf_context * ctx = ml.meta;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3408,15 +3676,23 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-                switch (hparams.n_layer) {
-                    case 22: model.type = e_model::MODEL_1B; break;
-                    case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
-                    case 48: model.type = e_model::MODEL_34B; break;
-                    case 60: model.type = e_model::MODEL_30B; break;
-                    case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
+                if (hparams.n_expert == 8) {
+                    switch (hparams.n_layer) {
+                        case 32: model.type = e_model::MODEL_8x7B; break;
+                        case 56: model.type = e_model::MODEL_8x22B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    switch (hparams.n_layer) {
+                        case 22: model.type = e_model::MODEL_1B; break;
+                        case 26: model.type = e_model::MODEL_3B; break;
+                        case 32: model.type = e_model::MODEL_7B; break;
+                        case 40: model.type = e_model::MODEL_13B; break;
+                        case 48: model.type = e_model::MODEL_34B; break;
+                        case 60: model.type = e_model::MODEL_30B; break;
+                        case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
                 }
             } break;
         case LLM_ARCH_MINICPM:
@@ -3428,6 +3704,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GROK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 64: model.type = e_model::MODEL_314B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_FALCON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3679,6 +3964,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_XVERSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 80: model.type = e_model::MODEL_65B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_COMMAND_R:
             {
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -3701,7 +3996,9 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+static std::vector<llama_vocab::id> llama_tokenize_internal(
+    const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
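The renamed parameters separate two concerns that the old booleans conflated: add_special controls whether BOS/EOS-type tokens are inserted for the model, while parse_special controls whether special-token text occurring in the raw input is recognized as control tokens rather than tokenized as plain characters. A hedged usage sketch against the public C API, assuming the llama.h bundled with this release (the negative return value of llama_tokenize is the required buffer size):

#include "llama.h"

#include <string>
#include <vector>

// sketch: tokenize with the two flags split apart; model loading is elided
static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text,
                                         bool add_special, bool parse_special) {
    std::vector<llama_token> tokens(text.size() + 8); // rough upper bound
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               add_special, parse_special);
    if (n < 0) {                // buffer was too small: -n is the needed size
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           add_special, parse_special);
    }
    tokens.resize(n);
    return tokens;
}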
@@ -3709,7 +4006,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;
 
-    struct gguf_context * ctx = ml.ctx_gguf;
+    struct gguf_context * ctx = ml.meta;
 
     const auto kv = LLM_KV(model.arch);
 
@@ -3723,23 +4020,27 @@ static void llm_load_vocab(
            vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
            // default special tokens
-           vocab.special_bos_id = -1;
-           vocab.special_eos_id = -1;
-           vocab.special_unk_id = -1;
-           vocab.special_sep_id = -1;
-           vocab.special_pad_id = -1;
-           vocab.linefeed_id    = -1;
+           vocab.special_bos_id  = -1;
+           vocab.special_eos_id  = -1;
+           vocab.special_unk_id  = -1;
+           vocab.special_sep_id  = -1;
+           vocab.special_pad_id  = -1;
+           vocab.special_cls_id  = -1;
+           vocab.special_mask_id = -1;
+           vocab.linefeed_id     = -1;
 
            return;
        } else if (tokenizer_name == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
            // default special tokens
-           vocab.special_bos_id = 1;
-           vocab.special_eos_id = 2;
-           vocab.special_unk_id = 0;
-           vocab.special_sep_id = -1;
-           vocab.special_pad_id = -1;
+           vocab.special_bos_id  = 1;
+           vocab.special_eos_id  = 2;
+           vocab.special_unk_id  = 0;
+           vocab.special_sep_id  = -1;
+           vocab.special_pad_id  = -1;
+           vocab.special_cls_id  = -1;
+           vocab.special_mask_id = -1;
 
            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
            if (add_space_prefix_keyidx != -1) {
@@ -3774,20 +4075,24 @@ static void llm_load_vocab(
            }
 
            // default special tokens
-           vocab.special_bos_id = 11;
-           vocab.special_eos_id = 11;
-           vocab.special_unk_id = -1;
-           vocab.special_sep_id = -1;
-           vocab.special_pad_id = -1;
+           vocab.special_bos_id  = 11;
+           vocab.special_eos_id  = 11;
+           vocab.special_unk_id  = -1;
+           vocab.special_sep_id  = -1;
+           vocab.special_pad_id  = -1;
+           vocab.special_cls_id  = -1;
+           vocab.special_mask_id = -1;
        } else if (tokenizer_name == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
            // default special tokens
-           vocab.special_bos_id = -1;
-           vocab.special_eos_id = -1;
-           vocab.special_unk_id = 100;
-           vocab.special_sep_id = 102;
-           vocab.special_pad_id = 0;
+           vocab.special_bos_id  = -1;
+           vocab.special_eos_id  = -1;
+           vocab.special_unk_id  = 100;
+           vocab.special_sep_id  = 102;
+           vocab.special_pad_id  = 0;
+           vocab.special_cls_id  = 101;
+           vocab.special_mask_id = 103;
            vocab.add_space_prefix = false;
        } else {
            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
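The new cls/mask defaults in the "bert" branch match the canonical BERT WordPiece vocabulary ids ([PAD]=0, [UNK]=100, [CLS]=101, [SEP]=102, [MASK]=103). WPM inputs are conventionally framed as [CLS] tokens [SEP]; a hypothetical helper sketching that framing with the ids from the diff:

#include <cstdint>
#include <vector>

using llama_token = int32_t; // matches the typedef in llama.h

// hypothetical helper: wrap a WPM (BERT-style) token sequence with the
// default special ids set above ([CLS]=101, [SEP]=102)
static std::vector<llama_token> frame_wpm(const std::vector<llama_token> & body,
                                          llama_token cls_id = 101, llama_token sep_id = 102) {
    std::vector<llama_token> out;
    out.reserve(body.size() + 2);
    out.push_back(cls_id);                           // leading [CLS]
    out.insert(out.end(), body.begin(), body.end()); // the tokenized text
    out.push_back(sep_id);                           // trailing [SEP]
    return out;
}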
@@ -3842,7 +4147,7 @@ static void llm_load_vocab(
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
         vocab.linefeed_id = ids[0];
     }
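Replacing "\u010A" with the explicit bytes "\xC4\x8A" removes any dependence on how the compiler maps universal character names into the narrow execution charset; the two spell the same UTF-8 sequence. U+010A ('Ċ') is the GPT-2 byte-level stand-in for the newline byte (0x0A is the 11th non-printable byte, mapped to 0x100 + 10). A quick check, valid whenever the execution charset is UTF-8:

#include <cassert>
#include <cstring>

int main() {
    const char * escaped = "\u010A";   // universal character name
    const char * bytes   = "\xC4\x8A"; // the same code point, spelled as UTF-8 bytes
    assert(std::strcmp(escaped, bytes) == 0); // holds for a UTF-8 execution charset
    return 0;
}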
@@ -3850,11 +4155,13 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
-            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
-            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
-            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
-            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+            { LLM_KV_TOKENIZER_BOS_ID,  vocab.special_bos_id  },
+            { LLM_KV_TOKENIZER_EOS_ID,  vocab.special_eos_id  },
+            { LLM_KV_TOKENIZER_UNK_ID,  vocab.special_unk_id  },
+            { LLM_KV_TOKENIZER_SEP_ID,  vocab.special_sep_id  },
+            { LLM_KV_TOKENIZER_PAD_ID,  vocab.special_pad_id  },
+            { LLM_KV_TOKENIZER_CLS_ID,  vocab.special_cls_id  },
+            { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
@@ -4046,12 +4353,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
+    if (vocab.special_bos_id  != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.linefeed_id     != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,     vocab.id_to_token[vocab.linefeed_id].text.c_str() );     }
 }
 
 // Returns false if cancelled by progress_callback
@@ -4075,6 +4384,7 @@ static bool llm_load_tensors(
 
     const int64_t n_layer     = hparams.n_layer;
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+    bool use_mmap_buffer = true;
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
     model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4163,6 +4473,10 @@ static bool llm_load_tensors(
 
     // create one context per buffer type
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+
+    // for moe merged tensors
+    ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
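A metadata-only ggml context still pays ggml_tensor_overhead() bytes per tensor it holds, and the merged-MoE path further below can create up to n_expert extra view tensors per layer on top of ml.n_tensors, which is what the added term reserves. A back-of-the-envelope sketch with assumed Mixtral-like counts (the tensor count is hypothetical, not taken from the diff):

#include <cstdio>

#include "ggml.h"

int main() {
    const size_t n_tensors = 995; // hypothetical stand-in for ml.n_tensors
    const size_t n_layer   = 32;
    const size_t n_expert  = 8;

    size_t ctx_size = ggml_tensor_overhead()*(n_tensors + 1); // +1 for duplicated tok_embd
    ctx_size       += ggml_tensor_overhead()*n_expert*n_layer; // room for moe expert views

    std::printf("reserving %zu bytes of tensor metadata (%zu per tensor)\n",
                ctx_size, ggml_tensor_overhead());
    return 0;
}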
@@ -4189,6 +4503,11 @@ static bool llm_load_tensors(
         const int64_t n_vocab      = hparams.n_vocab;
         const int64_t n_vocab_type = hparams.n_vocab_type;
         const int64_t n_ff         = hparams.n_ff;
+        const int64_t n_expert     = hparams.n_expert;
+
+        if (n_expert > 0 && hparams.n_expert_used == 0) {
+            throw std::runtime_error("model has expert layers but no expert layers are used");
+        }
 
         GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
 
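The hunk that follows is the heart of the MoE change: when a GGUF still stores each expert as its own 2-D tensor (the per-expert LLM_TENSOR_FFN_*_EXP layout), the loader allocates one merged 3-D tensor per projection and registers every expert as a view at byte offset nb[2]*x into it, then disables mmap because the split tensors have to be copied into the merged buffer instead of referenced in place. A minimal sketch of the same view arithmetic with plain ggml calls (toy shapes assumed):

#include "ggml.h"

// expert x of a merged {n_embd, n_ff, n_expert} tensor as a 2-D view;
// nb[2]*x is the same byte offset create_tensor_as_view uses in the diff
static struct ggml_tensor * expert_view(struct ggml_context * ctx,
                                        struct ggml_tensor * merged, int64_t x) {
    return ggml_view_2d(ctx, merged,
                        merged->ne[0], merged->ne[1], // n_embd, n_ff
                        merged->nb[1],                // row stride is unchanged
                        merged->nb[2]*x);             // skip x expert-sized slabs
}

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // assumed toy shapes; a real ffn_gate_exps would be {n_embd, n_ff, n_expert}
    struct ggml_tensor * merged = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 128, 8);
    struct ggml_tensor * e3     = expert_view(ctx, merged, 3); // reads/writes land in slab 3
    (void) e3;

    ggml_free(ctx);
    return 0;
}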
@@ -4243,26 +4562,113 @@ static bool llm_load_tensors(
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, hparams.n_expert}, false);
-
-                        if (layer.ffn_gate_inp == nullptr) {
-                            GGML_ASSERT(hparams.n_expert      == 0);
-                            GGML_ASSERT(hparams.n_expert_used == 0);
-
+                        if (n_expert == 0) {
                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                             layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
                         } else {
-                            GGML_ASSERT(hparams.n_expert      > 0);
-                            GGML_ASSERT(hparams.n_expert_used > 0);
-
-                            // MoE branch
-                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                                layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
-                                layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                                layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
+                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                            if (layer.ffn_gate_exps) {
+                                layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
+                                layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
+                            } else {
+                                // merge split expert into a single tensor for compatibility with older models
+                                // requires disabling mmap
+                                use_mmap_buffer = false;
+
+                                ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                                ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                                ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).c_str())->type;
+
+                                layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
+                                layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
+                                layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
+
+                                ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                                ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                                ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).c_str());
+
+                                for (uint32_t x = 0; x < n_expert; ++x) {
+                                    // the individual experts are loaded into a view of the merged tensor
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
+                                    ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
+                                }
+                            }
+                        }
+                    }
+                } break;
+            case LLM_ARCH_GROK:
+                {
+                    if (n_expert == 0) {
+                        throw std::runtime_error("Grok model cannot have zero experts");
+                    }
+
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        if (layer.ffn_gate_exps) {
+                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
+                            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
+                        } else {
+                            // merge split expert into a single tensor for compatibility with older models
+                            // requires disabling mmap
+                            use_mmap_buffer = false;
+
+                            ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                            ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
+                            ggml_type type_up   = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, 0).c_str())->type;
+
+                            layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd,   n_ff, n_expert);
+                            layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down,   n_ff, n_embd, n_expert);
+                            layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type_up,   n_embd,   n_ff, n_expert);
+
+                            ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
+                            ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
+                            ggml_set_name(layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i).c_str());
+
+                            for (uint32_t x = 0; x < n_expert; ++x) {
+                                // the individual experts are loaded into a view of the merged tensor
+                                ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
         
     | 
| 
      
 4666 
     | 
    
         
            +
                                            ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
         
     | 
| 
      
 4667 
     | 
    
         
            +
                                            ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
         
     | 
| 
       4264 
4668 
     | 
    
         
             
                                        }
         
     | 
| 
       4265 
4669 
     | 
    
         
             
                                    }
         
     | 
| 
      
 4670 
     | 
    
         
            +
             
     | 
| 
      
 4671 
     | 
    
         
            +
                                    layer.layer_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
         
     | 
| 
       4266 
4672 
     | 
    
         
             
                                }
         
     | 
| 
       4267 
4673 
     | 
    
         
             
                            } break;
         
     | 
| 
       4268 
4674 
     | 
    
         
             
                        case LLM_ARCH_BAICHUAN:
         
     | 
| 
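The `else` branch above is the heart of this hunk: when a GGUF file still stores one 2D tensor per expert, the loader allocates a single merged 3D tensor and registers each per-expert name as a 2D view at byte offset `nb[2]*x` into it, so the rest of the code sees the same merged layout for old and new files (at the cost of disabling mmap). Below is a minimal sketch of that view arithmetic with plain ggml calls; the sizes are toy values, not Grok's:

    #include "ggml.h"

    int main() {
        // small scratch context; no_alloc = false so tensor data is allocated too
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_embd = 8, n_ff = 4, n_expert = 3; // toy sizes

        // merged tensor: all experts stored back to back in one slab
        struct ggml_tensor * merged = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);

        for (int64_t x = 0; x < n_expert; ++x) {
            // 2D view of expert x: same data, offset by x whole expert slabs (nb[2] bytes each)
            struct ggml_tensor * expert_x = ggml_view_2d(ctx, merged,
                n_embd, n_ff,      // view shape
                merged->nb[1],     // row stride is unchanged
                merged->nb[2]*x);  // byte offset of expert x
            (void) expert_x;       // the loader would read the file tensor into this view
        }

        ggml_free(ctx);
        return 0;
    }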
@@ -4319,10 +4725,8 @@ static bool llm_load_tensors(
4319 4725 |                       layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4320 4726 |                       layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
4321 4727 | 
4322      | - 
4323      | - 
4324      | -                         layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd});
4325      | -                     }
     4728 | +                     layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
     4729 | +                     layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, false);
4326 4730 | 
4327 4731 |                       layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4328 4732 |                       layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
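Both replacement lines lean on the trailing `false` argument of `ml.create_tensor`, which marks a tensor as optional: if the name is absent from the file the call yields a null tensor instead of raising, which replaces the old find-then-create dance. A hedged sketch of what such a loader helper can look like (the `ModelLoader` type and its members here are invented for illustration, not this gem's API):

    #include <map>
    #include <stdexcept>
    #include <string>

    struct Tensor { /* metadata read from the file */ };

    struct ModelLoader {
        std::map<std::string, Tensor> tensors; // name -> tensor found in the file

        // returns nullptr for a missing optional tensor, throws for a missing required one
        Tensor * create_tensor(const std::string & name, bool required = true) {
            auto it = tensors.find(name);
            if (it == tensors.end()) {
                if (required) {
                    throw std::runtime_error("missing required tensor: " + name);
                }
                return nullptr; // caller can test, e.g. `if (layer.attn_norm_2) { ... }`
            }
            return &it->second;
        }
    };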
@@ -4502,6 +4906,7 @@ static bool llm_load_tensors(
4502 4906 |           case LLM_ARCH_MPT:
4503 4907 |               {
4504 4908 |                   model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
     4909 | +                 model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, hparams.n_ctx_train}, false);
4505 4910 | 
4506 4911 |                   // output
4507 4912 |                   {
@@ -4540,6 +4945,12 @@ static bool llm_load_tensors(
4540 4945 |                       layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
4541 4946 |                       layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, false);
4542 4947 | 
     4948 | +                     layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
     4949 | +                     layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, false);
     4950 | + 
     4951 | +                     layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
     4952 | +                     layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, false);
     4953 | + 
4543 4954 |                       // AWQ ScaleActivation layer
4544 4955 |                       layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4545 4956 |                   }
@@ -4986,6 +5397,28 @@ static bool llm_load_tensors(
4986 5397 |                       layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
4987 5398 |                   }
4988 5399 |               } break;
     5400 | +         case LLM_ARCH_XVERSE:
     5401 | +             {
     5402 | +                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
     5403 | +                 {
     5404 | +                     model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
     5405 | +                     model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
     5406 | +                 }
     5407 | +                 for (int i = 0; i < n_layer; ++i) {
     5408 | +                     ggml_context * ctx_layer = ctx_for_layer(i);
     5409 | +                     ggml_context * ctx_split = ctx_for_layer_split(i);
     5410 | +                     auto & layer = model.layers[i];
     5411 | +                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
     5412 | +                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
     5413 | +                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
     5414 | +                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
     5415 | +                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
     5416 | +                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
     5417 | +                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
     5418 | +                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
     5419 | +                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
     5420 | +                 }
     5421 | +             } break;
4989 5422 |           case LLM_ARCH_COMMAND_R:
4990 5423 |               {
4991 5424 |                   model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5007,6 +5440,11 @@ static bool llm_load_tensors(
5007 5440 | 
5008 5441 |                       layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5009 5442 | 
     5443 | +                     if (n_layer >= 64){
     5444 | +                         layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
     5445 | +                         layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
     5446 | +                     }
     5447 | + 
5010 5448 |                       layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
5011 5449 |                       layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
5012 5450 |                       layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
@@ -5024,56 +5462,97 @@ static bool llm_load_tensors(
5024 5462 | 
5025 5463 |     ml.done_getting_tensors();
5026 5464 | 
5027      | -     ml.
     5465 | +     ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
     5466 | +     model.mappings.reserve(ml.mappings.size());
5028 5467 | 
5029 5468 |     // create the backend buffers
5030      | -     std::vector<std::pair<ggml_context *, 
     5469 | +     std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
     5470 | +     ctx_bufs.reserve(ctx_map.size());
     5471 | + 
     5472 | +     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
     5473 | +     size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
     5474 | +     model.bufs.reserve(n_max_backend_buffer);
5031 5475 | 
5032 5476 |     for (auto & it : ctx_map) {
5033 5477 |         ggml_backend_buffer_type_t buft = it.first;
5034      | -         ggml_context * ctx
5035      | - 
     5478 | +         ggml_context * ctx              = it.second;
     5479 | + 
     5480 | +         llama_buf_map bufs;
     5481 | +         bufs.reserve(n_max_backend_buffer);
5036 5482 | 
5037 5483 |         // only the mmap region containing the tensors in the model is mapped to the backend buffer
5038 5484 |         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5039 5485 |         // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5040      | -         if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
5041      | - 
5042      | - 
5043      | - 
5044      | - 
5045      | - 
5046      | - 
     5486 | +         if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
     5487 | +             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
     5488 | +                 void * addr = nullptr;
     5489 | +                 size_t first, last;
     5490 | +                 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
     5491 | +                 if (first >= last) {
     5492 | +                     continue;
     5493 | +                 }
     5494 | +                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
     5495 | +                 if (buf == nullptr) {
     5496 | +                     throw std::runtime_error("unable to allocate backend CPU buffer");
     5497 | +                 }
     5498 | +                 model.bufs.push_back(buf);
     5499 | +                 bufs.emplace(idx, buf);
     5500 | + #ifdef GGML_USE_CUDA
     5501 | +                 if (n_layer >= n_gpu_layers) {
     5502 | +                     ggml_backend_cuda_register_host_buffer(
5047 5503 |                         ggml_backend_buffer_get_base(buf),
5048 5504 |                         ggml_backend_buffer_get_size(buf));
5049      | - 
     5505 | +                 }
5050 5506 | #endif
     5507 | +             }
5051 5508 |         }
5052 5509 | #ifdef GGML_USE_METAL
5053      | -         else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
5054      | - 
5055      | - 
5056      | - 
5057      | - 
     5510 | +         else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
     5511 | +             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
     5512 | +                 const size_t max_size = ggml_get_max_tensor_size(ctx);
     5513 | +                 void * addr = nullptr;
     5514 | +                 size_t first, last;
     5515 | +                 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
     5516 | +                 if (first >= last) {
     5517 | +                     continue;
     5518 | +                 }
     5519 | +                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
     5520 | +                 if (buf == nullptr) {
     5521 | +                     throw std::runtime_error("unable to allocate backend metal buffer");
     5522 | +                 }
     5523 | +                 model.bufs.push_back(buf);
     5524 | +                 bufs.emplace(idx, buf);
     5525 | +             }
5058 5526 |         }
5059 5527 | #endif
5060 5528 |         else {
5061      | -             buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
5062      | -             if (buf 
     5529 | +             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
     5530 | +             if (buf == nullptr) {
     5531 | +                 throw std::runtime_error("unable to allocate backend buffer");
     5532 | +             }
     5533 | +             model.bufs.push_back(buf);
     5534 | +             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
5063 5535 |                 model.mlock_bufs.emplace_back(new llama_mlock);
5064 5536 |                 auto & mlock_buf = model.mlock_bufs.back();
5065 5537 |                 mlock_buf->init   (ggml_backend_buffer_get_base(buf));
5066 5538 |                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
5067 5539 |             }
     5540 | +             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
     5541 | +                 bufs.emplace(idx, buf);
     5542 | +             }
5068 5543 |         }
5069      | - 
     5544 | + 
     5545 | +         if (bufs.empty()) {
5070 5546 |             throw std::runtime_error("failed to allocate buffer");
5071 5547 |         }
5072      | - 
5073      | - 
5074      | - 
5075      | - 
5076      | - 
     5548 | + 
     5549 | +         for (auto & buf : bufs) {
     5550 | +             // indicate that this buffer contains weights
     5551 | +             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
     5552 | +             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
     5553 | +         }
     5554 | + 
     5555 | +         ctx_bufs.emplace_back(ctx, bufs);
5077 5556 |     }
5078 5557 | 
5079 5558 |     if (llama_supports_gpu_offload()) {
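The rewritten branch works per mapped file: for each file it asks the loader for the byte range of the mapping that holds this context's tensors, then wraps exactly that range in a backend buffer, so a partially offloaded model only exposes what the CPU side actually uses. A toy POSIX sketch of the same wrap-a-mapped-subrange idea (the path and the [first, last) range are illustrative):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include <cstdio>

    int main() {
        const char * path = "model.bin"; // illustrative path
        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

        void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

        // pretend the loader reported that this context's tensors live in [first, last)
        size_t first = 0, last = (size_t) st.st_size;
        if (first < last) {
            // a backend buffer would wrap exactly this sub-range, not the whole file,
            // mirroring ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first)
            printf("buffer over %zu bytes at %p\n", last - first, (void *) ((char *) addr + first));
        }

        munmap(addr, (size_t) st.st_size);
        close(fd);
        return 0;
    }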
@@ -5105,13 +5584,17 @@ static bool llm_load_tensors(
5105 5584 |     // load tensor data
5106 5585 |     for (auto & it : ctx_bufs) {
5107 5586 |         ggml_context * ctx = it.first;
5108      | - 
5109      | -         if (!ml.load_all_data(ctx, 
     5587 | +         auto & bufs = it.second;
     5588 | +         if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
5110 5589 |             return false;
5111 5590 |         }
5112 5591 |     }
5113 5592 | 
5114      | - 
     5593 | +     if (use_mmap_buffer) {
     5594 | +         for (auto & mapping : ml.mappings) {
     5595 | +             model.mappings.emplace_back(std::move(mapping));
     5596 | +         }
     5597 | +     }
5115 5598 | 
5116 5599 |     // loading time will be recalculate after the first eval, so
5117 5600 |     // we take page faults deferred by mmap() into consideration
@@ -5266,8 +5749,8 @@ static void llm_build_kv_store(
5266 5749 |     GGML_ASSERT(kv.size == n_ctx);
5267 5750 | 
5268 5751 |     // compute the transposed [n_tokens, n_embd] V matrix
5269      | - 
5270      | - 
     5752 | +     assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
     5753 | +     struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5271 5754 |     cb(v_cur_t, "v_cur_t", il);
5272 5755 | 
5273 5756 |     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
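For orientation, the `v_cur_t` built above is copied into a V cache that is stored transposed: each channel owns a contiguous run of cache slots, so the later attention matmul can stream whole channels sequentially instead of striding across tokens. A toy illustration of that layout in plain C++ (all shapes are made up):

    #include <cstdio>
    #include <vector>

    int main() {
        // stand-ins for n_embd_v_gqa, n_tokens, n_ctx and the current cache head
        const int n_embd = 4, n_tokens = 3, n_ctx = 8, kv_head = 2;

        // v_cur: token t, channel c at v_cur[t*n_embd + c]
        std::vector<float> v_cur(n_embd * n_tokens);
        for (int t = 0; t < n_tokens; ++t)
            for (int c = 0; c < n_embd; ++c)
                v_cur[t*n_embd + c] = 10.0f*t + c;

        // transposed cache: channel c owns a contiguous row, v_cache[c*n_ctx + pos]
        std::vector<float> v_cache(n_embd * n_ctx, 0.0f);
        for (int t = 0; t < n_tokens; ++t)
            for (int c = 0; c < n_embd; ++c)
                v_cache[c*n_ctx + (kv_head + t)] = v_cur[t*n_embd + c];

        // attention can now read every cached value of one channel as one contiguous run
        printf("channel 0 row:");
        for (int pos = 0; pos < n_ctx; ++pos) printf(" %g", v_cache[0*n_ctx + pos]);
        printf("\n");
        return 0;
    }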
@@ -5451,6 +5934,20 @@ static struct ggml_tensor * llm_build_kqv(
5451 5934 |         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
5452 5935 |     }
5453 5936 | 
     5937 | +     if (model.arch == LLM_ARCH_GROK) {
     5938 | +         // need to do the following:
     5939 | +         // multiply by attn_output_multiplyer of 0.08838834764831845
     5940 | +         // and then :
     5941 | +         // kq = 30 * tanh(kq / 30)
     5942 | +         // before the softmax below
     5943 | + 
     5944 | +         //try from phi2
     5945 | +         //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     5946 | + 
     5947 | +         kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
     5948 | +         kq = ggml_scale(ctx, kq, 30);
     5949 | +     }
     5950 | + 
5454 5951 | #if defined(GGML_USE_KOMPUTE)
5455 5952 | #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
5456 5953 | #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
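The two chained calls fold Grok's attention-output multiplier (0.08838834764831845, which equals 1/sqrt(128)) together with the 30 * tanh(x / 30) logit soft-cap from the comment: scaling by mult/30 before the tanh and by 30 after is algebraically the same as 30 * tanh(mult * x / 30). A scalar sketch of that soft-cap:

    #include <cmath>
    #include <cstdio>

    // Grok-style soft-capped attention logit: 30 * tanh((mult * x) / 30).
    // Large logits stay bounded in (-30, 30) while small ones pass through ~linearly.
    static float soft_cap_logit(float x) {
        const float mult = 0.08838834764831845f; // attn output multiplier, == 1/sqrt(128)
        return 30.0f * std::tanh(x * (mult / 30.0f)); // same folding as the two ggml_scale calls
    }

    int main() {
        const float xs[] = {1.0f, 100.0f, 10000.0f};
        for (float x : xs) {
            printf("x = %8.1f -> %f\n", x, soft_cap_logit(x));
        }
        return 0;
    }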
@@ -5577,7 +6074,8 @@ struct llm_build_context {
5577 6074 |     const float norm_rms_eps;
5578 6075 | 
5579 6076 |     const int32_t n_tokens;
5580      | -     const int32_t n_kv;     // size of KV cache to consider (n_kv <= 
     6077 | +     const int32_t n_kv;     // size of KV cache to consider (n_kv <= kv_self.size)
     6078 | +     const int32_t n_outputs;
5581 6079 |     const int32_t kv_head;  // index of where we store new KV data in the cache
5582 6080 |     const int32_t n_orig_ctx;
5583 6081 | 
@@ -5624,6 +6122,7 @@ struct llm_build_context {
5624 6122 |         norm_rms_eps     (hparams.f_norm_rms_eps),
5625 6123 |         n_tokens         (batch.n_tokens),
5626 6124 |         n_kv             (worst_case ? kv_self.size : kv_self.n),
     6125 | +         n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
5627 6126 |         kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
5628 6127 |         n_orig_ctx       (cparams.n_yarn_orig_ctx),
5629 6128 |         pooling_type     (cparams.pooling_type),
@@ -5645,6 +6144,7 @@ struct llm_build_context {
5645 6144 |         lctx.inp_tokens = nullptr;
5646 6145 |         lctx.inp_embd = nullptr;
5647 6146 |         lctx.inp_pos = nullptr;
     6147 | +         lctx.inp_out_ids = nullptr;
5648 6148 |         lctx.inp_KQ_mask = nullptr;
5649 6149 |         lctx.inp_KQ_pos = nullptr;
5650 6150 |         lctx.inp_K_shift = nullptr;
@@ -5768,6 +6268,13 @@ struct llm_build_context {
5768 6268 |         return lctx.inp_pos;
5769 6269 |     }
5770 6270 | 
     6271 | +     struct ggml_tensor * build_inp_out_ids() {
     6272 | +         lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
     6273 | +         cb(lctx.inp_out_ids, "inp_out_ids", -1);
     6274 | +         ggml_set_input(lctx.inp_out_ids);
     6275 | +         return lctx.inp_out_ids;
     6276 | +     }
     6277 | + 
5771 6278 |     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
5772 6279 |         if (causal) {
5773 6280 |             lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens);
@@ -5824,6 +6331,9 @@ struct llm_build_context {
5824 6331 |     struct ggml_cgraph * build_llama() {
5825 6332 |         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5826 6333 | 
     6334 | +         // mutable variable, needed during the last layer of the computation to skip unused tokens
     6335 | +         int32_t n_tokens = this->n_tokens;
     6336 | + 
5827 6337 |         const int64_t n_embd_head = hparams.n_embd_head_v;
5828 6338 |         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5829 6339 |         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5891,6 +6401,14 @@ struct llm_build_context {
5891 6401 |                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5892 6402 |             }
5893 6403 | 
     6404 | +             if (il == n_layer - 1) {
     6405 | +                 // skip computing output for unused tokens
     6406 | +                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     6407 | +                 n_tokens = n_outputs;
     6408 | +                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
     6409 | +                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
     6410 | +             }
     6411 | + 
5894 6412 |             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
5895 6413 |             cb(ffn_inp, "ffn_inp", il);
5896 6414 | 
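The new block gathers, just before the last layer's FFN, only the rows whose logits are actually wanted (typically the final token of each sequence), so everything downstream runs over `n_outputs` rows instead of `n_tokens`. `ggml_get_rows` does all the work; a small standalone sketch with invented row counts:

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_embd = 8, n_tokens = 5, n_outputs = 1; // toy batch: only the last token needs logits

        // activations are left uninitialized; only the shapes matter for this sketch
        struct ggml_tensor * cur     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        struct ggml_tensor * out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_outputs);
        ((int32_t *) out_ids->data)[0] = n_tokens - 1; // keep only the final position

        // shrinks the activation from [n_embd, n_tokens] to [n_embd, n_outputs];
        // every op built on `trimmed` now touches 1 row instead of 5
        struct ggml_tensor * trimmed = ggml_get_rows(ctx, cur, out_ids);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, trimmed);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
        return 0;
    }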
@@ -5943,19 +6461,19 @@ struct llm_build_context {
5943 6461 |                 for (int i = 0; i < n_expert_used; ++i) {
5944 6462 |                     ggml_tensor * cur_expert;
5945 6463 | 
5946      | -                     ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].
     6464 | +                     ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
5947 6465 |                     cb(cur_up, "ffn_moe_up", il);
5948 6466 | 
5949      | -                     ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].
     6467 | +                     ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
5950 6468 |                     cb(cur_gate, "ffn_moe_gate", il);
5951 6469 | 
5952 6470 |                     cur_gate = ggml_silu(ctx0, cur_gate);
5953 6471 |                     cb(cur_gate, "ffn_moe_silu", il);
5954 6472 | 
5955      | -                     cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
     6473 | +                     cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
5956 6474 |                     cb(cur_expert, "ffn_moe_gate_par", il);
5957 6475 | 
5958      | -                     cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].
     6476 | +                     cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
5959 6477 |                     cb(cur_expert, "ffn_moe_down", il);
5960 6478 | 
5961 6479 |                     cur_expert = ggml_mul(ctx0, cur_expert,
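Each `ggml_mul_mat_id` above multiplies a token only against the weight slice of the expert chosen for it in `selected_experts`; around it the loop is the usual SwiGLU shape, silu(gate) * up followed by the down projection. A dense single-token reference of that per-expert computation in plain C++ (all sizes and weights invented):

    #include <cmath>
    #include <vector>

    // y = W_down * (silu(W_gate * x) * (W_up * x)) for one expert: the math that
    // ggml_mul_mat_id + ggml_silu + ggml_mul express per selected expert
    std::vector<float> expert_ffn(const std::vector<std::vector<float>> & W_gate,  // [n_ff][n_embd]
                                  const std::vector<std::vector<float>> & W_up,    // [n_ff][n_embd]
                                  const std::vector<std::vector<float>> & W_down,  // [n_embd][n_ff]
                                  const std::vector<float> & x) {                  // [n_embd]
        const size_t n_ff = W_gate.size(), n_embd = x.size();
        std::vector<float> h(n_ff);
        for (size_t j = 0; j < n_ff; ++j) {
            float g = 0.0f, u = 0.0f;
            for (size_t k = 0; k < n_embd; ++k) {
                g += W_gate[j][k] * x[k];
                u += W_up  [j][k] * x[k];
            }
            const float silu = g / (1.0f + std::exp(-g)); // silu(g) = g * sigmoid(g)
            h[j] = silu * u;                              // gate * up, elementwise
        }
        std::vector<float> y(n_embd, 0.0f);
        for (size_t d = 0; d < n_embd; ++d)
            for (size_t j = 0; j < n_ff; ++j)
                y[d] += W_down[d][j] * h[j];
        return y; // the router's weight for this expert scales y afterwards
    }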
@@ -6070,6 +6588,13 @@ struct llm_build_context {
6070 6588 |                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6071 6589 |             }
6072 6590 | 
     6591 | +             if (il == n_layer - 1) {
     6592 | +                 // skip computing output for unused tokens
     6593 | +                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     6594 | +                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
     6595 | +                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
     6596 | +             }
     6597 | + 
6073 6598 |             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6074 6599 |             cb(ffn_inp, "ffn_inp", il);
6075 6600 | 
@@ -6112,6 +6637,111 @@ struct llm_build_context {
6112 6637 |         return gf;
6113 6638 |     }
6114 6639 | 
     6640 | +     struct ggml_cgraph * build_xverse() {
     6641 | +         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
     6642 | + 
     6643 | +         const int64_t n_embd_head = hparams.n_embd_head_v;
     6644 | +         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     6645 | +         GGML_ASSERT(n_embd_head == hparams.n_rot);
     6646 | + 
     6647 | +         struct ggml_tensor * cur;
     6648 | +         struct ggml_tensor * inpL;
     6649 | + 
     6650 | +         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
     6651 | + 
     6652 | +         // inp_pos - contains the positions
     6653 | +         struct ggml_tensor * inp_pos = build_inp_pos();
     6654 | + 
     6655 | +         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     6656 | +         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
     6657 | + 
     6658 | +         // positions of the tokens in the KV cache
     6659 | +         struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
     6660 | + 
     6661 | +         for (int il = 0; il < n_layer; ++il) {
     6662 | +             struct ggml_tensor * inpSA = inpL;
     6663 | + 
     6664 | +             cur = llm_build_norm(ctx0, inpL, hparams,
     6665 | +                     model.layers[il].attn_norm, NULL,
     6666 | +                     LLM_NORM_RMS, cb, il);
     6667 | +             cb(cur, "attn_norm", il);
     6668 | + 
     6669 | +             // self-attention
     6670 | +             {
     6671 | +                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
     6672 | +                 cb(Qcur, "Qcur", il);
     6673 | + 
     6674 | +                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
     6675 | +                 cb(Kcur, "Kcur", il);
     6676 | + 
     6677 | +                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
     6678 | +                 cb(Vcur, "Vcur", il);
     6679 | + 
     6680 | +                 Qcur = ggml_rope_custom(
     6681 | +                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
     6682 | +                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
     6683 | +                     ext_factor, attn_factor, beta_fast, beta_slow
     6684 | +                 );
     6685 | +                 cb(Qcur, "Qcur", il);
     6686 | + 
     6687 | +                 Kcur = ggml_rope_custom(
     6688 | +                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
     6689 | +                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
     6690 | +                     ext_factor, attn_factor, beta_fast, beta_slow
     6691 | +                 );
     6692 | +                 cb(Kcur, "Kcur", il);
     6693 | +                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
     6694 | +                         model.layers[il].wo, NULL,
     6695 | +                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     6696 | +             }
     6697 | + 
     6698 | +             if (il == n_layer - 1) {
     6699 | +                 // skip computing output for unused tokens
     6700 | +                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     6701 | +                 cur   = ggml_get_rows(ctx0,      cur, inp_out_ids);
     6702 | +                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
     6703 | +             }
     6704 | + 
     6705 | +             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
     6706 | +             cb(ffn_inp, "ffn_inp", il);
     6707 | + 
     6708 | +             // feed-forward network
     6709 | +             {
     6710 | +                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
     6711 | +                         model.layers[il].ffn_norm, NULL,
     6712 | +                         LLM_NORM_RMS, cb, il);
     6713 | +                 cb(cur, "ffn_norm", il);
     6714 | + 
     6715 | +                 cur = llm_build_ffn(ctx0, cur,
     6716 | +                         model.layers[il].ffn_up,   NULL,
     6717 | +                         model.layers[il].ffn_gate, NULL,
     6718 | +                         model.layers[il].ffn_down, NULL,
     6719 | +                         NULL,
     6720 | +                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
     6721 | +                 cb(cur, "ffn_out", il);
     6722 | +             }
     6723 | + 
     6724 | +             cur = ggml_add(ctx0, cur, ffn_inp);
            +
                        cb(cur, "l_out", il);
         
     | 
| 
      
 6726 
     | 
    
         
            +
             
     | 
| 
      
 6727 
     | 
    
         
            +
                        // input for next layer
         
     | 
| 
      
 6728 
     | 
    
         
            +
                        inpL = cur;
         
     | 
| 
      
 6729 
     | 
    
         
            +
                    }
         
     | 
| 
      
 6730 
     | 
    
         
            +
             
     | 
| 
      
 6731 
     | 
    
         
            +
                    cur = inpL;
         
     | 
| 
      
 6732 
     | 
    
         
            +
             
     | 
| 
      
 6733 
     | 
    
         
            +
                    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
         
     | 
| 
      
 6734 
     | 
    
         
            +
                    cb(cur, "result_norm", -1);
         
     | 
| 
      
 6735 
     | 
    
         
            +
             
     | 
| 
      
 6736 
     | 
    
         
            +
                    // lm_head
         
     | 
| 
      
 6737 
     | 
    
         
            +
                    cur = ggml_mul_mat(ctx0, model.output, cur);
         
     | 
| 
      
 6738 
     | 
    
         
            +
                    cb(cur, "result_output", -1);
         
     | 
| 
      
 6739 
     | 
    
         
            +
             
     | 
| 
      
 6740 
     | 
    
         
            +
                    ggml_build_forward_expand(gf, cur);
         
     | 
| 
      
 6741 
     | 
    
         
            +
             
     | 
| 
      
 6742 
     | 
    
         
            +
                    return gf;
         
     | 
| 
      
 6743 
     | 
    
         
            +
                }
         
     | 
| 
      
 6744 
     | 
    
         
            +
             
     | 
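The `if (il == n_layer - 1)` block added above, and repeated in every builder that follows, is the recurring theme of this hunk: before the last layer's FFN and the lm_head run, the graph is shrunk with ggml_get_rows so that only tokens whose logits were actually requested are carried through. A minimal sketch of the gather semantics involved, in plain C++ rather than the real ggml API (get_rows, hidden and out_ids here are illustrative names only):

    #include <cstdio>
    #include <vector>

    // illustrative stand-in for ggml_get_rows: keep only the rows named in ids
    static std::vector<std::vector<float>> get_rows(
            const std::vector<std::vector<float>> & src,
            const std::vector<int> & ids) {
        std::vector<std::vector<float>> dst;
        dst.reserve(ids.size());
        for (int id : ids) {
            dst.push_back(src[id]); // one gathered row per requested output
        }
        return dst;
    }

    int main() {
        // four tokens in the batch, but only the last one needs logits
        std::vector<std::vector<float>> hidden = {{0.f}, {1.f}, {2.f}, {3.f}};
        auto out = get_rows(hidden, {3});
        std::printf("rows kept: %zu of %zu\n", out.size(), hidden.size());
        return 0;
    }

Because the gather happens before the final FFN and output matmul, those ops run over only the requested rows, which is presumably where the savings come from during large-batch prompt processing.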
6115 6745 |     struct ggml_cgraph * build_falcon() {
6116 6746 |         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6117 6747 |
@@ -6185,6 +6815,14 @@ struct llm_build_context {
6185 6815 |                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6186 6816 |             }
6187 6817 |
     6818 | +             if (il == n_layer - 1) {
     6819 | +                 // skip computing output for unused tokens
     6820 | +                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     6821 | +                 cur       = ggml_get_rows(ctx0,       cur, inp_out_ids);
     6822 | +                 inpL      = ggml_get_rows(ctx0,      inpL, inp_out_ids);
     6823 | +                 attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
     6824 | +             }
     6825 | +
6188 6826 |             struct ggml_tensor * ffn_inp = cur;
6189 6827 |
6190 6828 |             // feed forward
@@ -6225,144 +6863,359 @@ struct llm_build_context {
6225 6863 |         return gf;
6226 6864 |     }
6227 6865 |
6228      | -   struct ggml_cgraph * build_starcoder() {
     6866 | +   struct ggml_cgraph * build_grok() {
6229 6867 |         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6230 6868 |
     6869 | +       // mutable variable, needed during the last layer of the computation to skip unused tokens
     6870 | +       int32_t n_tokens = this->n_tokens;
     6871 | +
6231 6872 |         const int64_t n_embd_head = hparams.n_embd_head_v;
6232      | -       const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
6233 6873 |         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     6874 | +       GGML_ASSERT(n_embd_head == hparams.n_rot);
6234 6875 |
6235 6876 |         struct ggml_tensor * cur;
6236 6877 |         struct ggml_tensor * inpL;
6237 6878 |
6238 6879 |         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6239 6880 |
     6881 | +       // multiply by embedding_multiplier_scale of 78.38367176906169
     6882 | +       inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
     6883 | +
6240 6884 |         // inp_pos - contains the positions
6241 6885 |         struct ggml_tensor * inp_pos = build_inp_pos();
6242 6886 |
6243 6887 |         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6244 6888 |         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6245 6889 |
6246      | -       struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6247      | -       cb(pos, "pos_embd", -1);
6248      | -
6249      | -       inpL = ggml_add(ctx0, inpL, pos);
6250      | -       cb(inpL, "inpL", -1);
6251      | -
6252 6890 |         for (int il = 0; il < n_layer; ++il) {
     6891 | +           struct ggml_tensor * inpSA = inpL;
     6892 | +
     6893 | +           // norm
6253 6894 |             cur = llm_build_norm(ctx0, inpL, hparams,
6254      | -                   model.layers[il].attn_norm,
6255      | -                   model.layers[il].attn_norm_b,
6256      | -                   LLM_NORM, cb, il);
     6895 | +                   model.layers[il].attn_norm, NULL,
     6896 | +                   LLM_NORM_RMS, cb, il);
6257 6897 |             cb(cur, "attn_norm", il);
6258 6898 |
     6899 | +
6259 6900 |             // self-attention
6260 6901 |             {
6261      | -               cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6262      | -               cb(cur, "wqkv", il);
     6902 | +               // compute Q and K and RoPE them
     6903 | +               struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
     6904 | +               cb(Qcur, "Qcur", il);
     6905 | +               if (model.layers[il].bq) {
     6906 | +                   Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
     6907 | +                   cb(Qcur, "Qcur", il);
     6908 | +               }
6263 6909 |
6264      | -               cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6265      | -               cb(cur, "bqkv", il);
     6910 | +               struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
     6911 | +               cb(Kcur, "Kcur", il);
     6912 | +               if (model.layers[il].bk) {
     6913 | +                   Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
     6914 | +                   cb(Kcur, "Kcur", il);
     6915 | +               }
6266 6916 |
6267      | -               struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6268      | -               struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6269      | -               struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
     6917 | +               struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
     6918 | +               cb(Vcur, "Vcur", il);
     6919 | +               if (model.layers[il].bv) {
     6920 | +                   Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
     6921 | +                   cb(Vcur, "Vcur", il);
     6922 | +               }
6270 6923 |
     6924 | +               Qcur = ggml_rope_custom(
     6925 | +                   ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
     6926 | +                   n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
     6927 | +                   ext_factor, attn_factor, beta_fast, beta_slow
     6928 | +               );
6271 6929 |                 cb(Qcur, "Qcur", il);
6272      | -               cb(Kcur, "Kcur", il);
6273      | -               cb(Vcur, "Vcur", il);
6274 6930 |
6275      | -               Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
     6931 | +               Kcur = ggml_rope_custom(
     6932 | +                   ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
     6933 | +                   n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
     6934 | +                   ext_factor, attn_factor, beta_fast, beta_slow
     6935 | +               );
     6936 | +               cb(Kcur, "Kcur", il);
6276 6937 |
6277 6938 |                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6278 6939 |                         model.layers[il].wo, model.layers[il].bo,
6279      | -                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     6940 | +                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6280 6941 |             }
6281 6942 |
6282      | -           // add the input
6283      | -           struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6284      | -           cb(ffn_inp, "ffn_inp", il);
6285      | -
6286      | -           // FF
6287      | -           {
6288      | -               cur = llm_build_norm(ctx0, ffn_inp, hparams,
6289      | -                       model.layers[il].ffn_norm,
6290      | -                       model.layers[il].ffn_norm_b,
6291      | -                       LLM_NORM, cb, il);
6292      | -               cb(cur, "ffn_norm", il);
     6943 | +           if (il == n_layer - 1) {
     6944 | +               // skip computing output for unused tokens
     6945 | +               struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     6946 | +               n_tokens = n_outputs;
     6947 | +               cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
     6948 | +               inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
     6949 | +           }
6293 6950 |
6294      | -               cur = llm_build_ffn(ctx0, cur,
6295      | -                       model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
6296      | -                       NULL,                      NULL,
6297      | -                       model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6298      | -                       NULL,
6299      | -                       LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6300      | -               cb(cur, "ffn_out", il);
     6951 | +           // Grok
     6952 | +           // if attn_out_norm is present then apply it before adding the input
     6953 | +           if (model.layers[il].attn_out_norm) {
     6954 | +               cur = llm_build_norm(ctx0, cur, hparams,
     6955 | +                       model.layers[il].attn_out_norm, NULL,
     6956 | +                       LLM_NORM_RMS, cb, il);
     6957 | +               cb(cur, "attn_out_norm", il);
6301 6958 |             }
6302 6959 |
6303      | -           inpL = ggml_add(ctx0, cur, ffn_inp);
6304      | -           cb(inpL, "l_out", il);
6305      | -       }
     6960 | +           struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
     6961 | +           cb(ffn_inp, "ffn_inp", il);
6306 6962 |
6307      | -       cur = llm_build_norm(ctx0, inpL, hparams,
6308      | -               model.output_norm,
6309      | -               model.output_norm_b,
6310      | -               LLM_NORM, cb, -1);
6311      | -       cb(cur, "result_norm", -1);
     6963 | +           // feed-forward network
     6964 | +           // MoE branch
     6965 | +           cur = llm_build_norm(ctx0, ffn_inp, hparams,
     6966 | +                   model.layers[il].ffn_norm, NULL,
     6967 | +                   LLM_NORM_RMS, cb, il);
     6968 | +           cb(cur, "ffn_norm", il);
6312 6969 |
6313      | -       cur = ggml_mul_mat(ctx0, model.output, cur);
6314      | -       cb(cur, "result_output", -1);
     6970 | +           ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
     6971 | +           cb(logits, "ffn_moe_logits", il);
6315 6972 |
6316      | -       ggml_build_forward_expand(gf, cur);
     6973 | +           ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
     6974 | +           cb(probs, "ffn_moe_probs", il);
6317 6975 |
6318      | -       return gf;
6319      | -   }
     6976 | +           // select experts
     6977 | +           ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
     6978 | +           cb(selected_experts->src[0], "ffn_moe_argsort", il);
6320 6979 |
6321      | -   struct ggml_cgraph * build_persimmon() {
6322      | -       struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
     6980 | +           ggml_tensor * weights = ggml_get_rows(ctx0,
     6981 | +                   ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
     6982 | +           cb(weights, "ffn_moe_weights", il);
6323 6983 |
6324      | -       const int64_t n_embd_head = hparams.n_embd_head_v;
6325      | -       GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
6326      | -       GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
     6984 | +           weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6327 6985 |
6328      | -       struct ggml_tensor * cur;
6329      | -       struct ggml_tensor * inpL;
     6986 | +           ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
     6987 | +           cb(weights_sum, "ffn_moe_weights_sum", il);
6330 6988 |
6331      | -       inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
     6989 | +           weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
     6990 | +           cb(weights, "ffn_moe_weights_norm", il);
6332 6991 |
6333      | -       // inp_pos - contains the positions
6334      | -       struct ggml_tensor * inp_pos = build_inp_pos();
     6992 | +           // compute expert outputs
     6993 | +           ggml_tensor * moe_out = nullptr;
6335 6994 |
6336      | -       // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6337      | -       struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
     6995 | +           for (int i = 0; i < n_expert_used; ++i) {
     6996 | +               ggml_tensor * cur_expert;
6338 6997 |
6339      | -       for (int il = 0; il < n_layer; ++il) {
6340      | -           struct ggml_tensor * residual = inpL;
     6998 | +               ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
     6999 | +               cb(cur_up, "ffn_moe_up", il);
6341 7000 |
6342      | -           cur = llm_build_norm(ctx0, inpL, hparams,
6343      | -                   model.layers[il].attn_norm,
6344      | -                   model.layers[il].attn_norm_b,
6345      | -                   LLM_NORM, cb, il);
6346      | -           cb(cur, "attn_norm", il);
     7001 | +               ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
     7002 | +               cb(cur_gate, "ffn_moe_gate", il);
6347 7003 |
6348      | -           // self attention
6349      | -           {
6350      | -               cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6351      | -               cb(cur, "wqkv", il);
     7004 | +               //GeLU
     7005 | +               cur_gate = ggml_gelu(ctx0, cur_gate);
     7006 | +               cb(cur_gate, "ffn_moe_gelu", il);
6352 7007 |
6353      | -               cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6354      | -               cb(cur, "bqkv", il);
     7008 | +               cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
     7009 | +               cb(cur_expert, "ffn_moe_gate_par", il);
6355 7010 |
6356      | -               // split qkv
6357      | -               GGML_ASSERT(n_head_kv == n_head);
     7011 | +               cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
     7012 | +               cb(cur_expert, "ffn_moe_down", il);
6358 7013 |
6359      | -               struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
6360      | -               cb(tmpqkv, "tmpqkv", il);
     7014 | +               cur_expert = ggml_mul(ctx0, cur_expert,
     7015 | +                       ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
     7016 | +               cb(cur_expert, "ffn_moe_weighted", il);
6361 7017 |
6362      | -               struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
6363      | -               cb(tmpqkv_perm, "tmpqkv", il);
     7018 | +               if (i == 0) {
     7019 | +                   moe_out = cur_expert;
     7020 | +               } else {
     7021 | +                   moe_out = ggml_add(ctx0, moe_out, cur_expert);
     7022 | +                   cb(moe_out, "ffn_moe_out", il);
     7023 | +               }
     7024 | +           }
6364 7025 |
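The MoE routing above is the standard top-k mixture in graph form: router logits are softmaxed over all n_expert experts (ffn_moe_probs), the n_expert_used largest are picked (ggml_top_k), their probabilities are renormalized to sum to one (ffn_moe_weights_norm), and each selected expert's output is scaled by its weight and accumulated into moe_out. A self-contained sketch of that weighting arithmetic in plain C++ (toy sizes, not the ggml code):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        // router logits for one token over 4 experts, top-2 routing
        const std::vector<float> logits = {1.0f, 3.0f, 2.0f, 0.5f};
        const int n_expert_used = 2;

        // softmax over all experts (ffn_moe_probs)
        const float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(logits.size());
        float sum = 0.f;
        for (size_t i = 0; i < logits.size(); ++i) {
            probs[i] = std::exp(logits[i] - mx);
            sum += probs[i];
        }
        for (float & p : probs) p /= sum;

        // top-k selection (ggml_top_k / ffn_moe_argsort)
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights (ffn_moe_weights_norm)
        float wsum = 0.f;
        for (int i = 0; i < n_expert_used; ++i) wsum += probs[idx[i]];
        for (int i = 0; i < n_expert_used; ++i) {
            std::printf("expert %d weight %.4f\n", idx[i], probs[idx[i]] / wsum);
        }
        return 0;
    }

The ggml_mul_mat_id calls then evaluate only the selected experts' up/gate/down projections, so per-token FFN cost scales with n_expert_used rather than n_expert; Grok's one twist is the GeLU on the gate branch where the Mixtral-style blocks elsewhere in llama.cpp use SiLU.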
6365      | -               struct ggml_tensor * tmpq = ggml_view_3d(
     7026 | +           cur = moe_out;
     7027 | +
     7028 | +           // Grok
     7029 | +           // if layer_out_norm is present then apply it before adding the input
     7030 | +           // Idea: maybe ffn_out_norm is a better name
     7031 | +           if (model.layers[il].layer_out_norm) {
     7032 | +               cur = llm_build_norm(ctx0, cur, hparams,
     7033 | +                       model.layers[il].layer_out_norm, NULL,
     7034 | +                       LLM_NORM_RMS, cb, il);
     7035 | +               cb(cur, "layer_out_norm", il);
     7036 | +           }
     7037 | +
     7038 | +
     7039 | +           cur = ggml_add(ctx0, cur, ffn_inp);
     7040 | +           cb(cur, "ffn_out", il);
     7041 | +
     7042 | +           ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
     7043 | +           if (layer_dir != nullptr) {
     7044 | +               cur = ggml_add(ctx0, cur, layer_dir);
     7045 | +           }
     7046 | +           cb(cur, "l_out", il);
     7047 | +
     7048 | +           // input for next layer
     7049 | +           inpL = cur;
     7050 | +       }
     7051 | +
     7052 | +       cur = inpL;
     7053 | +
     7054 | +       cur = llm_build_norm(ctx0, cur, hparams,
     7055 | +               model.output_norm, NULL,
     7056 | +               LLM_NORM_RMS, cb, -1);
     7057 | +       cb(cur, "result_norm", -1);
     7058 | +
     7059 | +       // lm_head
     7060 | +       cur = ggml_mul_mat(ctx0, model.output, cur);
     7061 | +
     7062 | +       // Grok
     7063 | +       // multiply logits by output_multiplier_scale of 0.5773502691896257
     7064 | +
     7065 | +       cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
     7066 | +
     7067 | +       cb(cur, "result_output", -1);
     7068 | +
     7069 | +       ggml_build_forward_expand(gf, cur);
     7070 | +
     7071 | +       return gf;
     7072 | +   }
     7073 | +
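A note for reviewers on the two magic numbers in build_grok(): 0.5773502691896257 is 1/sqrt(3), and 78.38367176906169 is sqrt(6144) = 32*sqrt(6), with 6144 being Grok-1's embedding width, so the embedding multiplier looks like the familiar sqrt(n_embd) input scaling (an inference from the values, not something the diff states). A two-line check:

    #include <cmath>
    #include <cstdio>

    int main() {
        // output_multiplier_scale used in build_grok()
        std::printf("1/sqrt(3)  = %.16f\n", 1.0 / std::sqrt(3.0));
        // embedding_multiplier_scale; 6144 = Grok-1 n_embd (assumption)
        std::printf("sqrt(6144) = %.14f\n", std::sqrt(6144.0));
        return 0;
    }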
     7074 | +   struct ggml_cgraph * build_starcoder() {
     7075 | +       struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
     7076 | +
     7077 | +       const int64_t n_embd_head = hparams.n_embd_head_v;
     7078 | +       const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
     7079 | +       GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     7080 | +
     7081 | +       struct ggml_tensor * cur;
     7082 | +       struct ggml_tensor * inpL;
     7083 | +
     7084 | +       inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
     7085 | +
     7086 | +       // inp_pos - contains the positions
     7087 | +       struct ggml_tensor * inp_pos = build_inp_pos();
     7088 | +
     7089 | +       // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     7090 | +       struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
     7091 | +
     7092 | +       struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
     7093 | +       cb(pos, "pos_embd", -1);
     7094 | +
     7095 | +       inpL = ggml_add(ctx0, inpL, pos);
     7096 | +       cb(inpL, "inpL", -1);
     7097 | +
     7098 | +       for (int il = 0; il < n_layer; ++il) {
     7099 | +           cur = llm_build_norm(ctx0, inpL, hparams,
     7100 | +                   model.layers[il].attn_norm,
     7101 | +                   model.layers[il].attn_norm_b,
     7102 | +                   LLM_NORM, cb, il);
     7103 | +           cb(cur, "attn_norm", il);
     7104 | +
     7105 | +           // self-attention
     7106 | +           {
     7107 | +               cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
     7108 | +               cb(cur, "wqkv", il);
     7109 | +
     7110 | +               cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
     7111 | +               cb(cur, "bqkv", il);
     7112 | +
     7113 | +               struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
     7114 | +               struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
     7115 | +               struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
     7116 | +
     7117 | +               cb(Qcur, "Qcur", il);
     7118 | +               cb(Kcur, "Kcur", il);
     7119 | +               cb(Vcur, "Vcur", il);
     7120 | +
     7121 | +               Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
     7122 | +
     7123 | +               cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
     7124 | +                       model.layers[il].wo, model.layers[il].bo,
     7125 | +                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
     7126 | +           }
     7127 | +
     7128 | +           if (il == n_layer - 1) {
     7129 | +               // skip computing output for unused tokens
     7130 | +               struct ggml_tensor * inp_out_ids = build_inp_out_ids();
     7131 | +               cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
     7132 | +               inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
     7133 | +           }
     7134 | +
     7135 | +           // add the input
     7136 | +           struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
     7137 | +           cb(ffn_inp, "ffn_inp", il);
     7138 | +
     7139 | +           // FF
     7140 | +           {
     7141 | +               cur = llm_build_norm(ctx0, ffn_inp, hparams,
     7142 | +                       model.layers[il].ffn_norm,
     7143 | +                       model.layers[il].ffn_norm_b,
     7144 | +                       LLM_NORM, cb, il);
     7145 | +               cb(cur, "ffn_norm", il);
     7146 | +
     7147 | +               cur = llm_build_ffn(ctx0, cur,
     7148 | +                       model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
     7149 | +                       NULL,                      NULL,
     7150 | +                       model.layers[il].ffn_down, model.layers[il].ffn_down_b,
     7151 | +                       NULL,
     7152 | +                       LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
     7153 | +               cb(cur, "ffn_out", il);
     7154 | +           }
     7155 | +
     7156 | +           inpL = ggml_add(ctx0, cur, ffn_inp);
     7157 | +           cb(inpL, "l_out", il);
     7158 | +       }
     7159 | +
     7160 | +       cur = llm_build_norm(ctx0, inpL, hparams,
     7161 | +               model.output_norm,
     7162 | +               model.output_norm_b,
     7163 | +               LLM_NORM, cb, -1);
     7164 | +       cb(cur, "result_norm", -1);
     7165 | +
     7166 | +       cur = ggml_mul_mat(ctx0, model.output, cur);
     7167 | +       cb(cur, "result_output", -1);
     7168 | +
     7169 | +       ggml_build_forward_expand(gf, cur);
     7170 | +
     7171 | +       return gf;
     7172 | +   }
     7173 | +
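In build_starcoder() above there is no separate Q/K/V projection: a single wqkv matmul produces n_embd + 2*n_embd_gqa values per token, and Qcur/Kcur/Vcur are views into that buffer at float offsets 0, n_embd, and n_embd + n_embd_gqa — the sizeof(float) factors in the ggml_view_2d calls are those same offsets expressed in bytes, since ggml views are addressed by byte offset. A plain-C++ sketch of the slicing (toy buffer, not ggml):

    #include <cstdio>
    #include <vector>

    int main() {
        // toy sizes: n_embd = 4, n_embd_gqa = 2 -> 8 fused floats per token
        const int n_embd = 4, n_embd_gqa = 2;
        std::vector<float> qkv(n_embd + 2 * n_embd_gqa);
        for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

        // the same offsets the ggml_view_2d calls use, in units of floats
        const float * Q = qkv.data();
        const float * K = qkv.data() + n_embd;
        const float * V = qkv.data() + n_embd + n_embd_gqa;

        std::printf("Q[0]=%.0f K[0]=%.0f V[0]=%.0f\n", Q[0], K[0], V[0]);
        return 0;
    }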
      
 7174 
     | 
    
         
            +
                struct ggml_cgraph * build_persimmon() {
         
     | 
| 
      
 7175 
     | 
    
         
            +
                    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
         
     | 
| 
      
 7176 
     | 
    
         
            +
             
     | 
| 
      
 7177 
     | 
    
         
            +
                    const int64_t n_embd_head = hparams.n_embd_head_v;
         
     | 
| 
      
 7178 
     | 
    
         
            +
                    GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
         
     | 
| 
      
 7179 
     | 
    
         
            +
                    GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
         
     | 
| 
      
 7180 
     | 
    
         
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * residual = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                // split qkv
+                GGML_ASSERT(n_head_kv == n_head);
+
+                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
+                cb(tmpqkv, "tmpqkv", il);
+
+                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
+                cb(tmpqkv_perm, "tmpqkv", il);
+
+                struct ggml_tensor * tmpq = ggml_view_3d(
                         ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
                         ggml_element_size(tmpqkv_perm) * n_embd_head,
                         ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
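The block above splits a fused QKV projection by reshaping the matmul output to [n_embd_head, 3, n_head, n_tokens], permuting the q/k/v index outward, and then carving strided views out of the contiguous result. A minimal self-contained sketch of the same view arithmetic (toy sizes and variable names are assumptions for illustration, not part of the diff):

// Sketch: splitting a fused QKV tensor with strided ggml views (toy sizes).
#include "ggml.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

int main(void) {
    struct ggml_init_params ip = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ NULL, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_embd_head = 4, n_head = 2, n_tokens = 3;

    // output of a fused QKV matmul: [3*n_head*n_embd_head, n_tokens]
    struct ggml_tensor * qkv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3*n_head*n_embd_head, n_tokens);

    // expose the q/k/v index as its own dimension, then permute it outermost
    struct ggml_tensor * t = ggml_reshape_4d(ctx, qkv, n_embd_head, 3, n_head, n_tokens);
    t = ggml_cont(ctx, ggml_permute(ctx, t, 0, 3, 1, 2)); // -> [n_embd_head, n_head, n_tokens, 3]

    // q/k/v are now three contiguous slabs; each is a zero-copy strided view
    const size_t es   = ggml_element_size(t);
    const size_t slab = es * n_embd_head * n_head * n_tokens;
    struct ggml_tensor * q = ggml_view_3d(ctx, t, n_embd_head, n_head, n_tokens,
            es*n_embd_head, es*n_embd_head*n_head, 0*slab);
    struct ggml_tensor * k = ggml_view_3d(ctx, t, n_embd_head, n_head, n_tokens,
            es*n_embd_head, es*n_embd_head*n_head, 1*slab);
    struct ggml_tensor * v = ggml_view_3d(ctx, t, n_embd_head, n_head, n_tokens,
            es*n_embd_head, es*n_embd_head*n_head, 2*slab);

    assert(q->ne[0] == n_embd_head && k->ne[1] == n_head && v->ne[2] == n_tokens);
    ggml_free(ctx);
    return 0;
}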
@@ -6476,6 +7329,13 @@ struct llm_build_context {
                         Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur      = ggml_get_rows(ctx0,      cur, inp_out_ids);
+                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
             cb(ffn_inp, "ffn_inp", il);
 
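The added `if (il == n_layer - 1)` block is the output-pruning pattern that recurs through the rest of these hunks: just before the last layer's feed-forward, `ggml_get_rows` gathers only the rows whose logits or embeddings will actually be read, so the final norm and output projection run over n_outputs rows instead of n_tokens. A minimal sketch of the gather in isolation, under assumed toy shapes:

// Sketch: the "skip computing output for unused tokens" gather, in isolation.
#include "ggml.h"
#include <stdbool.h>
#include <stdint.h>

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_embd = 8, n_tokens = 5;
    struct ggml_tensor * hidden = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

    // ids of the tokens whose outputs will actually be read, e.g. only the last
    struct ggml_tensor * inp_out_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ((int32_t *) inp_out_ids->data)[0] = n_tokens - 1;

    // [n_embd, n_tokens] -> [n_embd, 1]: everything downstream of the gather
    // now touches one row instead of n_tokens rows
    struct ggml_tensor * pruned = ggml_get_rows(ctx, hidden, inp_out_ids);
    (void) pruned;

    ggml_free(ctx);
    return 0;
}

For prompt processing this is the difference between materializing a full batch of vocabulary-wide logits and materializing only the rows the caller asked for.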
@@ -6565,6 +7425,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -6722,6 +7589,13 @@ struct llm_build_context {
             }
             cb(cur, "kqv_out", il);
 
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             // re-add the layer input
             cur = ggml_add(ctx0, cur, inpL);
 
@@ -6844,6 +7718,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             // Add the input
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -6891,6 +7772,7 @@ struct llm_build_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -6901,6 +7783,16 @@ struct llm_build_context {
         // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
 
+        if (model.pos_embd) {
+            // inp_pos - contains the positions
+            struct ggml_tensor * inp_pos = build_inp_pos();
+            pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+            cb(pos, "pos_embd", -1);
+
+            inpL = ggml_add(ctx0, inpL, pos);
+            cb(inpL, "inpL", -1);
+        }
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
 
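The `model.pos_embd` block above is a learned absolute position embedding: a single `ggml_get_rows` lookup into the position table, added to the token embeddings before the first layer. A standalone sketch under assumed toy dimensions:

// Sketch: learned absolute position embeddings, mirroring the pos_embd block above.
#include "ggml.h"
#include <stdbool.h>
#include <stdint.h>

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_embd = 8, n_ctx_train = 16, n_tokens = 4;
    struct ggml_tensor * tok_embd_in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * pos_embd    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx_train);

    // absolute positions 0..n_tokens-1
    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    for (int i = 0; i < n_tokens; ++i) ((int32_t *) inp_pos->data)[i] = i;

    // look up one learned vector per position and add it to the token embeddings
    struct ggml_tensor * pos  = ggml_get_rows(ctx, pos_embd, inp_pos);
    struct ggml_tensor * inpL = ggml_add(ctx, tok_embd_in, pos);
    (void) inpL;

    ggml_free(ctx);
    return 0;
}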
@@ -6935,11 +7827,39 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                // Q/K Layernorm
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
 
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                } else {
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                            model.layers[il].wo, model.layers[il].bo,
+                            Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                }
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
 
             // Add the input
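`llm_build_norm` with `LLM_NORM` is an ordinary layernorm, applied here to Q and K before attention. A sketch of the equivalent ggml primitives; the `layer_norm` helper name, the 1e-5 eps, and the shapes are illustrative assumptions rather than values taken from the diff:

// Sketch: what the Q/K "LLM_NORM" calls above expand to, built from ggml
// primitives (hypothetical helper; eps and shapes are toy assumptions).
#include "ggml.h"
#include <stdbool.h>
#include <stddef.h>

static struct ggml_tensor * layer_norm(struct ggml_context * ctx,
        struct ggml_tensor * x, struct ggml_tensor * w, struct ggml_tensor * b) {
    x = ggml_norm(ctx, x, 1e-5f);        // normalize over the first dimension
    x = ggml_mul(ctx, x, w);             // scale
    return b ? ggml_add(ctx, x, b) : x;  // optional shift (NULL bias -> skip)
}

int main(void) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_embd = 8, n_tokens = 4;
    struct ggml_tensor * q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

    struct ggml_tensor * q_normed = layer_norm(ctx, q, w, NULL);
    (void) q_normed;

    ggml_free(ctx);
    return 0;
}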
@@ -7055,6 +7975,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -7161,6 +8088,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -7273,6 +8207,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -7391,6 +8332,14 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur              = ggml_get_rows(ctx0,              cur, inp_out_ids);
+                inpL             = ggml_get_rows(ctx0,             inpL, inp_out_ids);
+                attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+            }
+
             // FF
             {
                 ffn_output = llm_build_ffn(ctx0, attn_norm_output,
@@ -7488,6 +8437,14 @@ struct llm_build_context {
 
             cur = attention_norm;
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur    = ggml_get_rows(ctx0,    cur, inp_out_ids);
+                sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
+                inpL   = ggml_get_rows(ctx0,   inpL, inp_out_ids);
+            }
+
             // feed-forward network
             {
                 cur = llm_build_ffn(ctx0, cur,
@@ -7580,6 +8537,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             // add the input
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -7680,6 +8644,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             // add the input
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -7789,6 +8760,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -7899,6 +8877,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -8022,6 +9007,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             // scale_res - scale the hidden states for residual connection
             const float scale_res = scale_depth/sqrtf(float(n_layer));
             cur = ggml_scale(ctx0, cur, scale_res);
@@ -8136,6 +9128,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
             struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
             cb(sa_out, "sa_out", il);
 
@@ -8248,6 +9247,13 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -8395,6 +9401,15 @@ struct llm_build_context {
 
                     struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0);
 
+                    if (il == n_layer - 1) {
+                        // skip computing output for unused tokens
+                        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                        x    = ggml_get_rows(ctx0,    x, inp_out_ids);
+                        y    = ggml_get_rows(ctx0,    y, inp_out_ids);
+                        z    = ggml_get_rows(ctx0,    z, inp_out_ids);
+                        inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                    }
+
                     // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens}
                    y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
                    y = ggml_mul(ctx0, y, ggml_silu(ctx0, z));
@@ -8478,6 +9493,31 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_element_size(Qcur) * n_embd_head,
+                                ggml_element_size(Qcur) * n_embd_head * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_element_size(Kcur) * n_embd_head,
+                                ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                model.layers[il].attn_q_norm,
+                                NULL,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8497,6 +9537,14 @@ struct llm_build_context {
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            }
+
             struct ggml_tensor * attn_out = cur;
 
             // feed-forward network
@@ -8648,6 +9696,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_falcon();
             } break;
+        case LLM_ARCH_GROK:
+            {
+                result = llm.build_grok();
+            } break;
         case LLM_ARCH_STARCODER:
             {
                 result = llm.build_starcoder();
@@ -8725,6 +9777,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mamba();
             } break;
+        case LLM_ARCH_XVERSE:
+            {
+                result = llm.build_xverse();
+            } break;
         case LLM_ARCH_COMMAND_R:
             {
                 result = llm.build_command_r();
@@ -8790,9 +9846,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
+    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+        if (lctx.n_outputs == n_tokens) {
+            for (int i = 0; i < n_tokens; ++i) {
+                data[i] = i;
+            }
+        } else if (batch.logits) {
+            int32_t n_outputs = 0;
+            for (int i = 0; i < n_tokens; ++i) {
+                if (batch.logits[i]) {
+                    data[n_outputs++] = i;
+                }
+            }
+            // the graph needs to have been passed the correct number of outputs
+            GGML_ASSERT(lctx.n_outputs == n_outputs);
+        } else if (lctx.n_outputs == 1) {
+            // only keep last output
+            data[0] = n_tokens - 1;
+        } else {
+            GGML_ASSERT(lctx.n_outputs == 0);
+        }
+    }
+
     GGML_ASSERT(
+        // (!a || b) is a logical implication (a -> b)
+        // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "
+        "causal attention with embedding models is not supported"
     );
 
     if (lctx.inp_KQ_mask) {
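The fill logic above has three cases: every token produces an output, per-token `batch.logits` flags select a subset, or only the last token is kept. A plain restatement that runs on its own; `fill_out_ids` is a hypothetical name, not a function from the diff:

// Sketch: the three-way fill of inp_out_ids above, restated as a plain function.
// `logits` mirrors batch.logits (may be NULL); returns how many ids were written.
#include <stdint.h>
#include <stdio.h>

static int32_t fill_out_ids(int32_t * data, int32_t n_tokens, const int8_t * logits, int32_t n_outputs) {
    if (n_outputs == n_tokens) {            // every token keeps its output
        for (int32_t i = 0; i < n_tokens; ++i) data[i] = i;
        return n_tokens;
    }
    if (logits) {                           // per-token flags select the rows
        int32_t n = 0;
        for (int32_t i = 0; i < n_tokens; ++i) {
            if (logits[i]) data[n++] = i;
        }
        return n;                           // must match what the graph was built for
    }
    if (n_outputs == 1) {                   // default decode: only the last token
        data[0] = n_tokens - 1;
        return 1;
    }
    return 0;                               // embedding-only graphs may need none
}

int main(void) {
    int32_t ids[4];
    const int8_t flags[4] = { 0, 1, 0, 1 };
    int32_t n = fill_out_ids(ids, 4, flags, 2);
    printf("%d outputs: %d %d\n", n, ids[0], ids[1]);   // prints: 2 outputs: 1 3
    return 0;
}

The middle case is why the graph must be built with the same number of outputs that the batch's logits flags imply; the GGML_ASSERT in the diff enforces exactly that.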
@@ -8971,7 +10057,75 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     }
 }
 
-
+// Make sure enough space is available for outputs.
+// Returns max number of outputs for which space was reserved.
+static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+    const auto & cparams = lctx.cparams;
+    const auto & hparams = lctx.model.hparams;
+
+    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+
+    const auto n_batch = cparams.n_batch;
+    const auto n_vocab = hparams.n_vocab;
+    const auto n_embd  = hparams.n_embd;
+
+    // TODO: use a per-batch flag for logits presence instead
+    const bool has_logits = cparams.causal_attn;
+    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+
+    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
+    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
+
+    if (lctx.output_ids.empty()) {
+        // init, never resized afterwards
+        lctx.output_ids.resize(n_batch);
+    }
+
+    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
+    const size_t new_size  = (logits_size + embd_size) * sizeof(float);
+
+    // alloc only when more than the current capacity is required
+    // TODO: also consider shrinking the buffer
+    if (!lctx.buf_output || prev_size < new_size) {
+        if (lctx.buf_output) {
+#ifndef NDEBUG
+            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
+            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+            ggml_backend_buffer_free(lctx.buf_output);
+            lctx.buf_output = nullptr;
+            lctx.logits = nullptr;
+            lctx.embd = nullptr;
+        }
+
+        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
+        if (lctx.buf_output == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
+            return 0;
+        }
+    }
+
+    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
+
+    lctx.logits = has_logits ? output_base               : nullptr;
+    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;
+
+    lctx.output_size = n_outputs_max;
+    lctx.logits_size = logits_size;
+    lctx.embd_size   = embd_size;
+
+    // set all ids as invalid (negative)
+    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
+
+    ggml_backend_buffer_clear(lctx.buf_output, 0);
         
     | 
| 
      
 10121 
     | 
    
         
            +
             
     | 
| 
      
 10122 
     | 
    
         
            +
                lctx.n_outputs = 0;
         
     | 
| 
      
 10123 
     | 
    
         
            +
             
     | 
| 
      
 10124 
     | 
    
         
            +
                return n_outputs_max;
         
     | 
| 
      
 10125 
     | 
    
         
            +
            }
         
     | 
| 
      
 10126 
     | 
    
         
            +
             
     | 
| 
      
 10127 
     | 
    
         
            +
             
     | 
| 
      
 10128 
     | 
    
         
            +
            static void llama_graph_compute(
         
     | 
| 
       8975 
10129 
     | 
    
         
             
                    llama_context & lctx,
         
     | 
| 
       8976 
10130 
     | 
    
         
             
                      ggml_cgraph * gf,
         
     | 
| 
       8977 
10131 
     | 
    
         
             
                              int   n_threads) {
         
     | 
| 
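The new llama_output_reserve places logits and embeddings back to back in one backend buffer sized for n_outputs_max rows, and only reallocates when a batch needs more capacity than is already there. A minimal standalone sketch of the same grow-only, two-region layout (all names below are illustrative, not llama.cpp's API):

#include <algorithm>
#include <cstddef>
#include <vector>

// Grow-only buffer holding two float regions: [logits | embeddings].
// std::vector stands in for the ggml backend buffer used above.
struct OutputBuffer {
    std::vector<float> data;
    size_t logits_per_row = 0; // e.g. n_vocab
    size_t embd_per_row   = 0; // e.g. n_embd (0 when embeddings are off)
    size_t capacity_rows  = 0;

    // Returns the number of rows actually reserved (>= n_rows on success).
    size_t reserve_rows(size_t n_rows) {
        const size_t needed = n_rows * (logits_per_row + embd_per_row);
        if (needed > data.size()) {
            data.resize(needed); // grow only; never shrink
        }
        capacity_rows = std::max(capacity_rows, n_rows);
        return capacity_rows;
    }

    float * logits_row(size_t i) { return data.data() + i * logits_per_row; }
    float * embd_row  (size_t i) {
        // embeddings start after the whole logits region
        return data.data() + capacity_rows * logits_per_row + i * embd_per_row;
    }
};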
@@ -9046,16 +10200,8 @@ static int llama_decode_internal(
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
-
-    auto * logits_out = lctx.logits;
-
-#ifndef NDEBUG
-    auto & logits_valid = lctx.logits_valid;
-    logits_valid.clear();
-    logits_valid.resize(n_tokens_all);
-
-    memset(logits_out, 0, lctx.logits_size*sizeof(float));
-#endif
+    uint32_t n_outputs = 0;
+    uint32_t n_outputs_prev = 0;
 
     const auto n_ubatch = cparams.n_ubatch;
 
@@ -9064,6 +10210,38 @@ static int llama_decode_internal(
     std::vector<llama_seq_id *>            seq_id_arr;
     std::vector<std::vector<llama_seq_id>> seq_id;
 
+    // count outputs
+    if (batch_all.logits) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            n_outputs += batch_all.logits[i] != 0;
+        }
+    } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+        n_outputs = n_tokens_all;
+    } else {
+        // keep last output only
+        n_outputs = 1;
+    }
+
+    // reserve output buffer
+    if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
+        return -2;
+    };
+
+    // set output mappings
+    if (batch_all.logits) {
+        int32_t i_logits = 0;
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.logits[i]) {
+                lctx.output_ids[i] = i_logits++;
+            }
+        }
+    } else {
+        for (uint32_t i = 0; i < n_outputs; ++i) {
+            lctx.output_ids[i] = i;
+        }
+    }
+
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
         llama_batch u_batch = {
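Instead of writing one logits row per token, the decoder now counts how many outputs the batch actually requests and records in output_ids which compacted row each batch position maps to. A small self-contained sketch of that compaction (illustrative names only):

#include <cstdint>
#include <vector>

// Map batch positions to compacted output rows: positions whose logits
// flag is set get consecutive row indices, everything else stays -1.
std::vector<int32_t> build_output_ids(const std::vector<int8_t> & logits_flags) {
    std::vector<int32_t> output_ids(logits_flags.size(), -1);
    int32_t row = 0;
    for (size_t i = 0; i < logits_flags.size(); ++i) {
        if (logits_flags[i]) {
            output_ids[i] = row++;
        }
    }
    return output_ids; // row == number of output rows to reserve
}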
@@ -9079,6 +10257,27 @@ static int llama_decode_internal(
             /* .all_seq_id = */ batch_all.all_seq_id,
         };
 
+        // count the outputs in this u_batch
+        {
+            int32_t n_outputs_new = 0;
+
+            if (u_batch.logits) {
+                for (uint32_t i = 0; i < n_tokens; i++) {
+                    n_outputs_new += u_batch.logits[i] != 0;
+                }
+            } else if (n_outputs == n_tokens_all) {
+                n_outputs_new = n_tokens;
+            } else {
+                // keep last output only
+                if (cur_token + n_tokens >= n_tokens_all) {
+                    n_outputs_new = 1;
+                }
+            }
+
+            // needs to happen before the graph is built
+            lctx.n_outputs = n_outputs_new;
+        }
+
         int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
         GGML_ASSERT(n_threads > 0);
 
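Each micro-batch window computes its own output count before the graph is built, so the graph only materializes the rows that window contributes. A sketch of the windowing arithmetic for the "keep last output only" case (hypothetical helper, not library code):

#include <algorithm>
#include <cstdint>
#include <vector>

// Split n_tokens_all into windows of at most n_ubatch tokens and report
// how many outputs fall in each window when only the final token's
// output is kept.
std::vector<uint32_t> outputs_per_window(uint32_t n_tokens_all, uint32_t n_ubatch) {
    std::vector<uint32_t> counts;
    for (uint32_t cur = 0; cur < n_tokens_all; cur += n_ubatch) {
        const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur);
        // only the window containing the final token produces an output
        counts.push_back(cur + n_tokens >= n_tokens_all ? 1u : 0u);
    }
    return counts;
}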
@@ -9142,23 +10341,37 @@ static int llama_decode_internal(
         struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
         struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
 
-        if (!hparams.causal_attn) {
+        if (lctx.n_outputs == 0) {
+            // no output
+            res  = nullptr;
+            embd = nullptr;
+        } else if (!hparams.causal_attn) {
             res = nullptr; // do not extract logits for embedding models such as BERT
 
             // token or sequence embeddings
             embd = gf->nodes[gf->n_nodes - 1];
 
             GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
-        } else {
-            if (strcmp(res->name, "result_output") == 0) {
-                // the token embeddings could be the second to last tensor, or the third to last tensor
-                if (strcmp(embd->name, "result_norm") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 3];
-                    GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
-                }
-            } else {
-                GGML_ASSERT(false && "missing result_output tensor");
+        } else if (cparams.embeddings) {
+            // the embeddings could be in the second to last tensor, or any of the previous tensors
+            int i_embd = gf->n_nodes - 2;
+            for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
+                i_embd = gf->n_nodes - i;
+                if (i_embd < 0) { break; }
+                embd = gf->nodes[i_embd];
             }
+            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
+
+            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
+            if (!cparams.causal_attn) {
+                res = nullptr; // do not extract logits when not needed
+                // skip computing logits
+                // TODO: is this safe?
+                gf->n_nodes = i_embd + 1;
+            }
+        } else {
+            embd = nullptr; // do not extract embeddings when not needed
+            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
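To locate the embeddings tensor, the code now walks backwards from the end of the graph until it hits the node named "result_norm", and can then truncate gf->n_nodes at that index to skip computing the logits head entirely. A standalone sketch of such a backwards scan (simplified types, not ggml's API):

#include <cstring>
#include <vector>

struct Node { const char * name; };

// Scan a topologically ordered node list from the back and return the
// index of the last node with the given name, or -1 if absent.
// Truncating the list at index + 1 then drops every later node.
int find_last_named(const std::vector<Node> & nodes, const char * name) {
    for (int i = (int) nodes.size() - 1; i >= 0; --i) {
        if (std::strcmp(nodes[i].name, name) == 0) {
            return i;
        }
    }
    return -1;
}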
@@ -9201,50 +10414,23 @@ static int llama_decode_internal(
         //}
 
         // extract logits
-        // TODO: do not compute and extract logits if only embeddings are needed
-        //       update the graphs to skip "result_output" if logits are not needed
         if (res) {
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
             GGML_ASSERT(backend_res != nullptr);
-            if (u_batch.logits) {
-                int32_t i_first = -1;
-                for (uint32_t i = 0; i < n_tokens; i++) {
-                    if (u_batch.logits[i] && i_first == -1) {
-                        i_first = (int32_t) i;
-                    }
-                    if (u_batch.logits[i] == 0 || i == n_tokens - 1) {
-                        if (i_first != -1) {
-                            int i_last = u_batch.logits[i] == 0 ? i : i + 1;
-                            // extract logits for the range [i_first, i_last)
-                            // group the requests to minimize the number of calls to the backend
-                            ggml_backend_tensor_get_async(backend_res, res,
-                                logits_out + n_vocab*(cur_token + i_first),
-                                i_first*n_vocab*sizeof(float),
-                                (i_last - i_first)*n_vocab*sizeof(float));
-                            i_first = -1;
-                        }
-                    }
-#ifndef NDEBUG
-                    logits_valid[cur_token + i] = u_batch.logits[i] != 0;;
-#endif
-                }
-            } else if (lctx.logits_all) {
-                ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float));
-#ifndef NDEBUG
-                std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true);
-#endif
-            } else {
-                if (cur_token + n_tokens >= n_tokens_all) {
-                    ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
-#ifndef NDEBUG
-                    logits_valid[0] = true;
-#endif
-                }
+            GGML_ASSERT(lctx.logits != nullptr);
+
+            float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
+            const int32_t n_outputs_new = lctx.n_outputs;
+
+            if (n_outputs_new) {
+                GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
+                ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
             }
         }
 
         // extract embeddings
-        if (cparams.embeddings) {
+        if (embd) {
             ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
             GGML_ASSERT(backend_embd != nullptr);
 
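With the outputs compacted, each micro-batch appends its rows at offset n_outputs_prev and the logits come back in one contiguous async copy, replacing the old per-range requests and the NDEBUG-only logits_valid bookkeeping. A sketch of the append arithmetic (plain memcpy standing in for the backend's async read):

#include <cstring>
#include <vector>

// Append n_new rows of width n_vocab from src into dst at row offset
// n_prev, mimicking the single contiguous copy done above.
void append_rows(std::vector<float> & dst, const float * src,
                 size_t n_prev, size_t n_new, size_t n_vocab) {
    dst.resize((n_prev + n_new) * n_vocab);
    std::memcpy(dst.data() + n_prev * n_vocab, src, n_new * n_vocab * sizeof(float));
}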
@@ -9252,16 +10438,14 @@ static int llama_decode_internal(
                 case LLAMA_POOLING_TYPE_NONE:
                     {
                         // extract token embeddings
-                        float * embd_out = lctx.embd;
-
-                        if (u_batch.logits) {
-                            for (uint32_t i = 0; i < n_tokens; i++) {
-                                if (u_batch.logits[i] == 0) {
-                                    continue;
-                                }
-
-                                ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
-                            }
+                        GGML_ASSERT(lctx.embd != nullptr);
+                        float * embd_out = lctx.embd + n_outputs_prev*n_embd;
+                        const int32_t n_outputs_new = lctx.n_outputs;
+
+                        if (n_outputs_new) {
+                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                            GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                         }
                     } break;
                 case LLAMA_POOLING_TYPE_CLS:
@@ -9288,8 +10472,12 @@ static int llama_decode_internal(
                 } break;
             }
         }
+        n_outputs_prev += lctx.n_outputs;
     }
 
+    // set to total number of outputs in the batch, for use in llama_get_logits_ith
+    lctx.n_outputs = n_outputs;
+
     // wait for the computation to finish (automatically done when obtaining the model output)
     //llama_synchronize(&lctx);
 
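After the loop, lctx.n_outputs holds the batch-wide output count so llama_get_logits_ith can translate a token position into a compacted row through output_ids. A sketch of that lookup, mirroring the mapping built earlier (illustrative, not the library's function):

#include <cstdint>
#include <vector>

// Resolve the logits row for batch position i, or nullptr when that
// position did not request logits (its output_ids entry stays -1).
const float * logits_for_position(const std::vector<float> & logits,
                                  const std::vector<int32_t> & output_ids,
                                  size_t n_vocab, int32_t i) {
    if (i < 0 || (size_t) i >= output_ids.size()) return nullptr;
    const int32_t row = output_ids[i];
    if (row < 0) return nullptr;
    return logits.data() + (size_t) row * n_vocab;
}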
@@ -9933,7 +11121,7 @@ struct llm_tokenizer_bpe {
                 add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
             }
 
-            // add the fnished tokens to the final list keeping correct order for next and prev
+            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
@@ -10202,9 +11390,6 @@ struct llm_tokenizer_wpm {
                 output.push_back(vocab.special_unk_id);
             }
         }
-
-        // append eos token
-        output.push_back(vocab.special_eos_id);
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
@@ -10218,7 +11403,7 @@ struct llm_tokenizer_wpm {
                 if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
                     continue;
                 }
-                code = to_lower(code);
+                code = unicode_tolower(code);
                 if (type == CODEPOINT_TYPE_WHITESPACE) {
                     code = ' ';
                 }
@@ -10238,7 +11423,7 @@ struct llm_tokenizer_wpm {
         std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
-            if (isspace(new_str[r])) {
+            if (isspace(new_str[r], std::locale::classic())) {
                 if (r > l) words.push_back(new_str.substr(l, (r - l)));
                 l = r + 1;
                 r = l;
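Switching to the std::isspace overload that takes std::locale::classic() makes the split independent of whatever global C locale the host application has set (and avoids the undefined behavior plain isspace has for char values outside the unsigned char range). A small demonstration of the locale-pinned overload:

#include <iostream>
#include <locale>
#include <string>
#include <vector>

// Split on whitespace using the classic "C" locale, so the result does
// not change when the host application calls setlocale().
std::vector<std::string> split_ws(const std::string & s) {
    std::vector<std::string> words;
    std::string cur;
    for (char c : s) {
        if (std::isspace(c, std::locale::classic())) {
            if (!cur.empty()) { words.push_back(cur); cur.clear(); }
        } else {
            cur.push_back(c);
        }
    }
    if (!cur.empty()) words.push_back(cur);
    return words;
}

int main() {
    for (const auto & w : split_ws("hello wpm  tokenizer")) std::cout << w << '\n';
}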
@@ -10252,18 +11437,12 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    uint32_t to_lower(uint32_t code) {
-        static const std::locale locale("en_US.UTF-8");
-#if defined(_WIN32)
-        if (code > 0xFFFF) {
-            return code;
-        }
-#endif
-        return std::tolower(wchar_t(code), locale);
-    }
-
     bool is_ascii_punct(uint32_t code) {
-        return code < 256 && ispunct(code);
+        if (code > 0xFF) {
+            return false;
+        }
+        auto c = char(static_cast<unsigned char>(code));
+        return ispunct(c, std::locale::classic());
    }
 
    bool is_chinese_char(uint32_t cpt) {
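The wpm tokenizer's locale-based to_lower, which depended on an en_US.UTF-8 locale being installed on the host, is gone; lowercasing now goes through unicode_tolower, backed by the tables added in unicode-data.cpp. A toy version of such a table-driven mapping (illustrative data, not the real tables):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Table-driven lowercase lookup: (upper, lower) codepoint pairs kept
// sorted by the upper codepoint; unknown codepoints map to themselves.
static const std::vector<std::pair<uint32_t, uint32_t>> k_tolower = {
    {0x0041, 0x0061}, // A -> a (toy subset)
    {0x00C9, 0x00E9}, // É -> é
    {0x0391, 0x03B1}, // Α -> α
};

uint32_t toy_tolower(uint32_t cpt) {
    auto it = std::lower_bound(k_tolower.begin(), k_tolower.end(), cpt,
        [](const std::pair<uint32_t, uint32_t> & p, uint32_t c) { return p.first < c; });
    return (it != k_tolower.end() && it->first == cpt) ? it->second : cpt;
}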
@@ -10415,30 +11594,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
     std::vector<llama_vocab::id> output;
-
-    // OG tokenizer behavior:
-    //
-    // tokenizer.encode('', add_bos=True)  returns [1]
-    // tokenizer.encode('', add_bos=False) returns []
-
-    if (bos && vocab.special_bos_id != -1) {
-        output.push_back(vocab.special_bos_id);
-    }
-
-    if (raw_text.empty()) {
-        return output;
-    }
-
     std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
 
-    if (special) tokenizer_st_partition(vocab, fragment_buffer);
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+    }
 
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True)  returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                if (add_special && vocab.special_add_bos != 0) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
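The single bos flag is split in two: add_special controls whether the vocabulary's leading and trailing special tokens (BOS/EOS or CLS/SEP) are added around the result, while parse_special controls whether special-token text inside raw_text is matched and emitted as token ids. A conceptual truth table (illustrative pseudo-calls, not the exact API):

// tokenize("hi",    add_special=true,  parse_special=false)
//     -> [BOS, ...tokens("hi")]       special text in input stays literal
// tokenize("<s>hi", add_special=false, parse_special=true)
//     -> [BOS_id, ...tokens("hi")]    "<s>" matched as a special token
// tokenize("<s>hi", add_special=false, parse_special=false)
//     -> [...tokens("<s>hi")]         "<s>" tokenized as plain text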
@@ -10464,9 +11641,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
+                if (add_special && vocab.special_add_bos == 1) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10480,9 +11667,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                GGML_ASSERT(vocab.special_add_eos != 1);
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    output.push_back(vocab.special_cls_id);
+                }
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -10496,6 +11690,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         output.push_back(fragment.token);
                     }
                 }
+
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    output.push_back(vocab.special_sep_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_NONE:
             GGML_ASSERT(false);
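For WPM (BERT-style) vocabularies the special tokens are now driven by add_special: the sequence is bracketed as CLS ... SEP here, replacing the unconditional EOS append removed from llm_tokenizer_wpm above. The resulting layout (illustrative BERT-style ids):

// add_special=true on a WPM vocab now yields:
//   [CLS=101] tok("hello") tok("world") [SEP=102]
// while add_special=false yields just:
//   tok("hello") tok("world")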
@@ -10508,28 +11707,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 // grammar - internal
 //
 
-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8                                      partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t               index;
-    const uint32_t     * code_points;
-    llama_partial_utf8   partial_utf8;
-};
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
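The grammar structs move out of llama.cpp into a shared declaration, and decode_utf8 loses its static linkage, so other translation units can use them. The llama_partial_utf8 carry-over makes the decoder resumable across chunk boundaries; a sketch of a resumable call chain, assuming those declarations are now visible (the chunking itself is hypothetical):

#include <string>
// assumes llama_partial_utf8 / decode_utf8 from the llama internals

void demo_resumable_decode() {
    llama_partial_utf8 state = { 0, 0 };
    auto first  = decode_utf8("gr\xC3", state);  // chunk ends mid-codepoint
    state = first.second;                        // state.n_remain == 1
    auto second = decode_utf8("\xA9", state);    // completes U+00E9 ('é')
    // second.first now starts with the codepoint that straddled the boundary
}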
@@ -10680,7 +11861,9 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.emplace_back(stack);
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
         return;
     }
@@ -10717,7 +11900,10 @@ static void llama_grammar_advance_stack(
             }
             case LLAMA_GRETYPE_CHAR:
            case LLAMA_GRETYPE_CHAR_NOT:
-                new_stacks.emplace_back(stack);
+                if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                    // only add the stack if it's not a duplicate of one we already have
+                    new_stacks.emplace_back(stack);
+                }
                 break;
             default:
                 // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
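Both insertion points now deduplicate grammar stacks with a linear std::find before emplace_back, keeping the stack set small at the cost of an O(n) lookup, which is acceptable while stack counts stay low. The same guard as a reusable helper (a sketch, not library code):

#include <algorithm>
#include <vector>

// Push v into vec only if an equal element is not already present.
// Linear scan: fine for the small collections used by the grammar code.
template <typename T>
void push_unique(std::vector<T> & vec, const T & v) {
    if (std::find(vec.begin(), vec.end(), v) == vec.end()) {
        vec.push_back(v);
    }
}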
@@ -10731,12 +11917,13 @@ static void llama_grammar_advance_stack(
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+void llama_grammar_accept(
         const std::vector<std::vector<llama_grammar_element>>         & rules,
         const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr) {
+        const uint32_t                                                  chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks) {
 
-    std::vector<std::vector<const llama_grammar_element *>> new_stacks;
+    new_stacks.clear();
 
     for (const auto & stack : stacks) {
         if (stack.empty()) {
         @@ -10755,8 +11942,6 @@ static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_acc 
     | 
|
| 
       10755 
11942 
     | 
    
         
             
                        llama_grammar_advance_stack(rules, new_stack, new_stacks);
         
     | 
| 
       10756 
11943 
     | 
    
         
             
                    }
         
     | 
| 
       10757 
11944 
     | 
    
         
             
                }
         
     | 
| 
       10758 
     | 
    
         
            -
             
     | 
| 
       10759 
     | 
    
         
            -
                return new_stacks;
         
     | 
| 
       10760 
11945 
     | 
    
         
             
            }
         
     | 
| 
       10761 
11946 
     | 
    
         | 
| 
       10762 
11947 
     | 
    
         
             
            static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
         
     | 
| 
         @@ -10770,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_ 
     | 
|
| 
       10770 
11955 
     | 
    
         
             
                    const std::vector<llama_grammar_candidate>            & candidates) {
         
     | 
| 
       10771 
11956 
     | 
    
         | 
| 
       10772 
11957 
     | 
    
         
             
                std::vector<llama_grammar_candidate> rejects;
         
     | 
| 
      
 11958 
     | 
    
         
            +
                rejects.reserve(candidates.size());
         
     | 
| 
       10773 
11959 
     | 
    
         | 
| 
       10774 
11960 
     | 
    
         
             
                if (stack.empty()) {
         
     | 
| 
       10775 
11961 
     | 
    
         
             
                    for (const auto & tok : candidates) {
         
     | 
| 
         @@ -10783,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_ 
     | 
|
| 
       10783 
11969 
     | 
    
         
             
                const llama_grammar_element * stack_pos = stack.back();
         
     | 
| 
       10784 
11970 
     | 
    
         | 
| 
       10785 
11971 
     | 
    
         
             
                std::vector<llama_grammar_candidate> next_candidates;
         
     | 
| 
      
 11972 
     | 
    
         
            +
                next_candidates.reserve(candidates.size());
         
     | 
| 
      
 11973 
     | 
    
         
            +
             
     | 
| 
       10786 
11974 
     | 
    
         
             
                for (const auto & tok : candidates) {
         
     | 
| 
       10787 
11975 
     | 
    
         
             
                    if (*tok.code_points == 0) {
         
     | 
| 
       10788 
11976 
     | 
    
         
             
                        // reached end of full codepoints in token, reject iff it ended in a partial sequence
         
     | 
| 
@@ -11590,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
+    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+        grammar->stacks = tmp_new_stacks;
     }
     grammar->partial_utf8 = decoded.second;
     GGML_ASSERT(!grammar->stacks.empty());
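The grammar hunks above turn `llama_grammar_accept` from return-by-value into fill-a-caller-buffer: the same `tmp_new_stacks` vector is reused for every code point of the token, so its capacity is recycled instead of being reallocated per character. A sketch of the pattern with illustrative types (not llama.cpp's):

```cpp
// Out-parameter refactor sketch: the caller owns one scratch buffer and
// reuses it across the whole loop, as accept_token does with tmp_new_stacks.
#include <vector>

using stack_t = std::vector<int>;

// before: returned a freshly allocated vector per call
// after:  clears and fills a caller-provided vector
static void accept(const std::vector<stack_t> & stacks, int chr, std::vector<stack_t> & new_stacks) {
    new_stacks.clear(); // mirrors new_stacks.clear() in llama_grammar_accept
    for (const auto & s : stacks) {
        if (!s.empty() && s.back() == chr) {
            new_stacks.push_back(s); // capacity from earlier iterations is reused
        }
    }
}

int main() {
    std::vector<stack_t> stacks = {{1}, {2}};
    std::vector<stack_t> tmp;                 // one scratch buffer
    const int code_points[] = {1, 2, 1};
    for (int chr : code_points) {
        accept(stacks, chr, tmp);
        stacks = tmp;
    }
}
```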
@@ -11957,7 +13147,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            n_layer /= n_expert;
             if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                 throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
             }
@@ -11971,30 +13160,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
-        }
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            int nx = tensor->ne[0];
+            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M   ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+        }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+        }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -12013,7 +13211,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_expert == 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
@@ -12027,13 +13225,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -12186,7 +13378,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
+        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
+        new_type == GGML_TYPE_IQ1_M) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -12204,6 +13397,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ3_S:
             case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_IQ1_M:
             case GGML_TYPE_Q2_K:
             case GGML_TYPE_Q3_K:
             case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
@@ -12219,9 +13413,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
-    int counter = 0;
+    int64_t counter = 0;
     size_t new_size = 0;
     if (nthread < 2) {
         // single-thread
@@ -12229,11 +13423,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     }
     auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
-        const int nrows_per_chunk = chunk_size / n_per_row;
+        const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
-            int first_row = counter; counter += nrows_per_chunk;
+            int64_t first_row = counter; counter += nrows_per_chunk;
             if (first_row >= nrows) {
                 if (local_size > 0) {
                     new_size += local_size;
@@ -12241,7 +13435,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
                 break;
             }
             lock.unlock();
-            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
             local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
         }
     };
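The hunks above widen the row counters of `llama_tensor_quantize_internal` from `int` to `int64_t`, so very large tensors cannot overflow the shared chunk counter. The worker pattern itself is unchanged: lock, claim the next chunk of rows, unlock, quantize. A self-contained sketch of that shared-counter pattern (the "work" here is just summing rows):

```cpp
// Shared-counter chunking sketch, mirroring the compute lambda above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const int64_t nrows = 1000, nrows_per_chunk = 64;
    const int     nthread = 4;

    std::mutex mutex;
    int64_t counter = 0; // next row to hand out; int64_t so huge tensors cannot overflow it
    int64_t total   = 0; // aggregated under the same mutex, like new_size

    auto compute = [&]() {
        int64_t local = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter;
            counter += nrows_per_chunk;
            if (first_row >= nrows) { // no work left: publish the local result and exit
                total += local;
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            local += this_nrow;       // real code: ggml_quantize_chunk(...)
        }
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < nthread; ++t) workers.emplace_back(compute);
    for (auto & w : workers) w.join();
    printf("rows processed: %lld\n", (long long)total);
}
```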
@@ -12285,6 +13479,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
@@ -12307,8 +13502,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap, NULL);
-    ml.init_mapping(false); // no prefetching
+    llama_model_kv_override * kv_overrides = nullptr;
+    if (params->kv_overrides) {
+        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        kv_overrides = v->data();
+    }
+    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    ml.init_mappings(false); // no prefetching
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -12332,36 +13532,48 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
+    if (params->kv_overrides) {
+        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+        for (auto & o : overrides) {
+            if (o.key[0] == 0) break;
+            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+            } else {
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+        if (name.find("attn_v.weight")   != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
-        }
-        else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_ffn_down;
-        }
-        else if (name.find("ffn_gate") != std::string::npos) {
-            ++qs.n_ffn_gate;
-        }
-        else if (name.find("ffn_up") != std::string::npos) {
-            ++qs.n_ffn_up;
-        }
-        else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
     }
-
-    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
-    }
+
+    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
+
+    // sanity checks
+    //
+    //  - qs.n_attention_wv == 0                     for Mamba       models
+    //  - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+    //
+    GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
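The hunk above threads user KV overrides into the quantizer: `params->kv_overrides` is a type-erased pointer to a `std::vector<llama_model_kv_override>`, terminated by an entry whose key starts with a NUL byte (hence the `o.key[0] == 0` check). A hedged sketch of building such a list; the tag names and value fields appear in the diff itself, while the fixed-size `key` buffer and `"example.int_key"` are assumptions for illustration:

```cpp
// Hedged sketch: building the override list that params->kv_overrides points to.
// Assumes llama_model_kv_override from this release's llama.h has a fixed-size
// char key buffer plus the tag / int_value / float_value / bool_value members
// used by the quantizer above.
#include <cstring>
#include <vector>
#include "llama.h"

std::vector<llama_model_kv_override> make_overrides() {
    std::vector<llama_model_kv_override> kv;

    llama_model_kv_override o{};
    std::strncpy(o.key, "example.int_key", sizeof(o.key) - 1); // hypothetical key
    o.tag = LLAMA_KV_OVERRIDE_TYPE_INT; // tag selects which union member is read
    o.int_value = 42;
    kv.push_back(o);

    kv.emplace_back(); // zero-initialized entry: key[0] == 0 terminates the scan
    return kv;
}

// usage (kv_overrides is an untyped pointer in llama_model_quantize_params):
//   auto kv = make_overrides();
//   llama_model_quantize_params params = llama_model_quantize_default_params();
//   params.kv_overrides = &kv;
```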
@@ -12377,7 +13589,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -12391,6 +13603,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
+    const auto tn = LLM_TN(model.arch);
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
@@ -12413,8 +13627,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(tensor) == 2);
+        // quantize only 2D and 3D tensors (experts)
+        quantize &= (ggml_n_dims(tensor) >= 2);
+
+        // do not quantize norm tensors
+        quantize &= name.find("_norm.weight") == std::string::npos;
+
         quantize &= params->quantize_output_tensor || name != "output.weight";
         quantize &= !params->only_copy;
@@ -12443,6 +13661,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
+            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+                new_type = params->token_embedding_type;
+            }
+            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+                new_type = params->output_tensor_type;
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -12455,7 +13679,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_size = ggml_nbytes(tensor);
             LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
-            const size_t nelements = ggml_nelements(tensor);
+            const int64_t nelements = ggml_nelements(tensor);
 
             const float * imatrix = nullptr;
             if (imatrix_data) {
@@ -12463,11 +13687,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (it == imatrix_data->end()) {
                 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
             } else {
-                if (it->second.size() == (size_t)tensor->ne[0]) {
+                if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                     imatrix = it->second.data();
                 } else {
                     LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
-                            int(it->second.size()), int(tensor->ne[0]), tensor->name);
+                            int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+                    // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+                    // this is a significant error and it may be good idea to abort the process if this happens,
+                    // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+                    // tok_embd should be ignored in this case, since it always causes this warning
+                    if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+                        throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+                    }
                 }
             }
         }
@@ -12475,6 +13708,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                  new_type == GGML_TYPE_IQ2_XS  ||
                  new_type == GGML_TYPE_IQ2_S   ||
                  new_type == GGML_TYPE_IQ1_S   ||
+                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))  ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -12497,21 +13731,30 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            if (work.size() < nelements * 4) {
+            if (work.size() < (size_t)nelements * 4) {
                 work.resize(nelements * 4); // upper bound on size
             }
             new_data = work.data();
 
-            const int n_per_row = tensor->ne[0];
-            const int nrows = tensor->ne[1];
+            const int64_t n_per_row = tensor->ne[0];
+            const int64_t nrows = tensor->ne[1];
+
+            static const int64_t min_chunk_size = 32 * 512;
+            const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
 
-            static const int min_chunk_size = 32 * 512;
-            const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
 
-            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-            new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
+            // quantize each expert separately since they have different importance matrices
+            new_size = 0;
+            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
+                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+            }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
         total_size_org += ggml_nbytes(tensor);
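For MoE tensors the quantizer now walks the third dimension, treating each of the `ne[2]` expert slices as an independent matrix: each slice is `ne[0]*ne[1]` input floats, `ggml_row_size(new_type, n_per_row) * nrows` output bytes, and `n_per_row` imatrix weights. A small sketch of that offset arithmetic with made-up sizes (the row byte count is a stand-in, not a real `ggml_row_size` result):

```cpp
// Per-expert offset arithmetic from the hunk above, with hypothetical sizes.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_per_row = 4096, nrows = 14336, n_expert = 8;
    const int64_t nelements_matrix = n_per_row * nrows; // floats per expert slice
    const int64_t row_bytes = 144;                      // stand-in for ggml_row_size(new_type, n_per_row)

    for (int64_t i03 = 0; i03 < n_expert; ++i03) {
        const int64_t f32_off     = i03 * nelements_matrix;  // input float offset
        const int64_t out_off     = row_bytes * i03 * nrows; // output byte offset
        const int64_t imatrix_off = i03 * n_per_row;         // per-expert imatrix offset
        printf("expert %lld: in +%lld floats, out +%lld bytes, imatrix +%lld weights\n",
               (long long)i03, (long long)f32_off, (long long)out_off, (long long)imatrix_off);
    }
}
```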
@@ -12582,7 +13825,7 @@ static int llama_apply_lora_from_file_internal(
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
-        ml->init_mapping(/*prefetch*/ false); // no prefetching
+        ml->init_mappings(/*prefetch*/ false); // no prefetching
     }
 
     struct tensor_meta {
@@ -12703,7 +13946,7 @@ static int llama_apply_lora_from_file_internal(
 
         ggml_tensor * base_t;
         if (ml) {
-            if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
             }
@@ -12887,11 +14130,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
     };
 
     return result;
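Note on the three new quantize fields above: output_tensor_type and token_embedding_type default to GGML_TYPE_COUNT, which means "no override, keep the built-in heuristic", and kv_overrides threads optional GGUF metadata overrides into the quantized file. A minimal sketch of how a caller might use the overrides (the .gguf paths are illustrative, not from this package):

    // Sketch: quantize while pinning the output tensor to Q6_K.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype              = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.output_tensor_type = GGML_TYPE_Q6_K; // override only the output tensor
    // token_embedding_type is left as GGML_TYPE_COUNT, so the default heuristic applies
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);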
@@ -12900,7 +14146,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
     return 1;
-#elif defined(
+#elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
     return GGML_SYCL_MAX_DEVICES;
@@ -12920,8 +14166,8 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(
-    defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
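With the Kompute backend added to this list, llama_supports_gpu_offload() now reports offload capability for six backends. A hedged sketch of gating an application's layer-offload request on it (requested_gpu_layers is a hypothetical application setting):

    // Sketch: only ask for GPU layers when the build can actually offload.
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = llama_supports_gpu_offload() ? requested_gpu_layers : 0;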
@@ -13028,7 +14274,7 @@ struct llama_context * llama_new_context_with_model(
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
 
-
+    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
     cparams.yarn_ext_factor  = params.yarn_ext_factor;
@@ -13126,7 +14372,7 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(ctx->backend_metal);
         }
-#elif defined(
+#elif defined(GGML_USE_CUDA)
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
             ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -13149,7 +14395,20 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_VULKAN)
-        if (model->
+        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_t backend = ggml_backend_vk_init(0);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_vk_init(device);
                 if (backend == nullptr) {
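The Vulkan path above now rejects LLAMA_SPLIT_MODE_ROW outright and initializes only device 0 for LLAMA_SPLIT_MODE_NONE, leaving the per-device loop below for the layer-split case. A sketch of model parameters that stay within these constraints (values illustrative):

    // Sketch: parameters a Vulkan build will accept.
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_LAYER; // ROW would fail context creation here
    mparams.main_gpu   = 0;                      // the NONE path always initializes Vulkan device 0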
@@ -13161,30 +14420,28 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_SYCL)
-
-
-
-
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            if (backend == nullptr) {
+                int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_LAYER requires a backend for each GPU
+            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                int 
-
+                    int id_list[GGML_SYCL_MAX_DEVICES];
+                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
-            } else {
-                // LLAMA_SPLIT_LAYER requires a backend for each GPU
-                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                    ggml_backend_t backend = ggml_backend_sycl_init(i);
-                    if (backend == nullptr) {
-                        int id_list[GGML_SYCL_MAX_DEVICES];
-                        ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
-                        llama_free(ctx);
-                        return nullptr;
-                    }
-                    ctx->backends.push_back(backend);
-                }
             }
         }
 #elif defined(GGML_USE_KOMPUTE)
@@ -13232,25 +14489,12 @@ struct llama_context * llama_new_context_with_model(
 
         // graph outputs buffer
         {
-            // resized during inference
-            ctx
-
-
-            const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
-
-            ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size);
-            if (ctx->buf_output == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__);
+            // resized during inference when a batch uses more outputs
+            if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
+                LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
                 llama_free(ctx);
                 return nullptr;
             }
-            ggml_backend_buffer_clear(ctx->buf_output, 0);
-
-
-            ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output);
-            if (params.embeddings) {
-                ctx->embd = ctx->logits + ctx->logits_size;
-            }
 
             LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
                     ggml_backend_buffer_name(ctx->buf_output),
@@ -13275,7 +14519,7 @@ struct llama_context * llama_new_context_with_model(
 
             // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
             bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef 
+#ifndef GGML_USE_CUDA
             // pipeline parallelism requires support for async compute and events
             // currently this is only implemented in the CUDA backend
             pipeline_parallel = false;
@@ -13383,11 +14627,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
         case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
@@ -13763,30 +15009,60 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }
 
+// deprecated
+size_t llama_get_state_size(const struct llama_context * ctx) {
+    return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    return llama_state_get_data(ctx, dst);
+}
+
+// deprecated
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    return llama_state_set_data(ctx, src);
+}
+
+// deprecated
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}
 
 // Returns the *maximum* size of the state
-size_t 
+size_t llama_state_get_size(const struct llama_context * ctx) {
+    const auto & cparams = ctx->cparams;
+    const auto & hparams = ctx->model.hparams;
+
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size        = sizeof(size_t);
     const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_n_outputs       = sizeof(size_t);
+    // assume worst case for outputs although only currently set ones are serialized
+    const size_t s_output_pos      = ctx->cparams.n_batch * sizeof(int32_t);
     const size_t s_logits_size     = sizeof(size_t);
-
-    const size_t s_logits          = ctx->logits_size * sizeof(float);
+    const size_t s_logits          = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
     const size_t s_embedding_size  = sizeof(size_t);
-    const size_t s_embedding       = ctx->embd_size * sizeof(float);
+    const size_t s_embedding       = ctx->embd_size   ? cparams.n_batch * hparams.n_embd  * sizeof(float) : 0;
     const size_t s_kv_buf_size     = sizeof(size_t);
     const size_t s_kv_head         = sizeof(uint32_t);
     const size_t s_kv_size         = sizeof(uint32_t);
     const size_t s_kv_used         = sizeof(uint32_t);
     const size_t s_kv              = ctx->kv_self.total_size();
-
-    const size_t s_kv_cell         = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+    const size_t s_kv_cell         = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
     const size_t s_kv_cells        = ctx->kv_self.size * s_kv_cell;
 
     const size_t s_total = (
         + s_rng_size
         + s_rng
+        + s_n_outputs
+        + s_output_pos
         + s_logits_size
         + s_logits
         + s_embedding_size
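The old state entry points survive above as one-line deprecated wrappers, so existing callers (including these Ruby bindings) keep working while new code moves to the llama_state_* names. A hedged round-trip sketch under the new naming, assuming a fully initialized llama_context * ctx and <vector> included:

    // Sketch: save and restore context state via the renamed API.
    std::vector<uint8_t> state(llama_state_get_size(ctx));          // was llama_get_state_size
    const size_t written = llama_state_get_data(ctx, state.data()); // was llama_copy_state_data
    // ... later, on a context created with the same model and parameters:
    const size_t read    = llama_state_set_data(ctx, state.data()); // was llama_set_state_data
    // both sizes stay within the reserved maximum reported by llama_state_get_size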
@@ -13847,21 +15123,21 @@ struct llama_data_file_context : llama_data_context {
  * file context:
  * llama_file file("/path", "wb");
  * llama_data_file_context data_ctx(&file);
- * 
+ * llama_state_get_data(ctx, &data_ctx);
  *
  * buffer context:
  * std::vector<uint8_t> buf(max_size, 0);
  * llama_data_buffer_context data_ctx(&buf.data());
- * 
+ * llama_state_get_data(ctx, &data_ctx);
  *
 */
-static void 
+static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::ostringstream rng_ss;
         rng_ss << ctx->rng;
 
-        const std::string & rng_str 
+        const std::string & rng_str  = rng_ss.str();
         const size_t        rng_size = rng_str.size();
 
         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
@@ -13870,25 +15146,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(rng_str.data(), rng_size);
     }
 
-    // copy 
+    // copy outputs
     {
-
+        // Can't use ctx->n_outputs because it's not for the
+        // entire last batch when n_ubatch is smaller than n_batch
+        size_t n_outputs = 0;
+
+        // copy output ids
+        {
+            std::vector<int32_t> output_pos;
 
-
+            const size_t    n_batch = ctx->cparams.n_batch;
+            const auto & output_ids = ctx->output_ids;
 
-
-
+            output_pos.resize(ctx->output_size);
+
+            // build a more compact representation of the output ids
+            for (size_t i = 0; i < n_batch; ++i) {
+                // map an output id to a position in the batch
+                int32_t pos = output_ids[i];
+                if (pos >= 0) {
+                    if ((size_t) pos >= n_outputs) {
+                        n_outputs = pos + 1;
+                    }
+                    GGML_ASSERT((size_t) pos < ctx->output_size);
+                    output_pos[pos] = i;
+                }
+            }
+
+            data_ctx->write(&n_outputs, sizeof(n_outputs));
+
+            if (n_outputs) {
+                data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
+            }
         }
-    }
 
-
-
-
+        // copy logits
+        {
+            const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
 
-
+            data_ctx->write(&logits_size, sizeof(logits_size));
 
-
-
+            if (logits_size) {
+                data_ctx->write(ctx->logits, logits_size * sizeof(float));
+            }
+        }
+
+        // copy embeddings
+        {
+            const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
+
+            data_ctx->write(&embeddings_size, sizeof(embeddings_size));
+
+            if (embeddings_size) {
+                data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
+            }
         }
     }
 
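The output_pos array serialized above is the inverse mapping of output_ids: entry j records which batch position produced output row j. For example (values illustrative), with n_batch = 4 and output_ids = { -1, 0, -1, 1 }, rows 0 and 1 came from batch positions 1 and 3, so n_outputs becomes 2 and output_pos = { 1, 3 }; the logits and embeddings sections that follow are then truncated to those n_outputs rows.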
@@ -13901,9 +15213,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
 
-
+        // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
         const uint32_t kv_head     = llama_kv_cache_cell_max(kv_self);
         const uint32_t kv_size     = kv_self.size;
+        const size_t   kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
         const uint32_t kv_used     = kv_self.used;
 
         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
@@ -13912,6 +15225,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used,     sizeof(kv_used));
 
         if (kv_buf_size) {
+            const size_t pre_kv_buf_size = data_ctx->get_size_written();
+
             std::vector<uint8_t> tmp_buf;
             for (int il = 0; il < (int) n_layer; ++il) {
                 const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
@@ -13941,6 +15256,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                     data_ctx->write(tmp_buf.data(), tmp_buf.size());
                 }
             }
+            GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
         }
 
         for (uint32_t i = 0; i < kv_head; ++i) {
@@ -13959,15 +15275,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     }
 }
 
-size_t 
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
     llama_data_buffer_context data_ctx(dst);
-
+    llama_state_get_data_internal(ctx, &data_ctx);
 
     return data_ctx.get_size_written();
 }
 
 // Sets the state reading from the specified source address
-size_t 
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
     const uint8_t * inp = src;
 
     // set rng
@@ -13985,6 +15301,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         GGML_ASSERT(!rng_ss.fail());
     }
 
+    // set output ids
+    {
+        size_t n_outputs;
+        std::vector<int32_t> output_pos;
+
+        memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
+
+        GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
+
+        if (n_outputs) {
+            output_pos.resize(n_outputs);
+            memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
+            inp += n_outputs * sizeof(int32_t);
+
+            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+                int32_t id = output_pos[i];
+                GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
+                ctx->output_ids[id] = i;
+            }
+        }
+    }
+
     // set logits
     {
         size_t logits_size;
@@ -14005,7 +15343,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
         memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
 
-        GGML_ASSERT(ctx->embd_size 
+        GGML_ASSERT(ctx->embd_size >= embeddings_size);
 
         if (embeddings_size) {
             memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
     | 
|
| 
       14032 
15370 
     | 
    
         
             
                    memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
         
     | 
| 
       14033 
15371 
     | 
    
         
             
                    memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
         
     | 
| 
       14034 
15372 
     | 
    
         | 
| 
      
 15373 
     | 
    
         
            +
                    if (kv_self.size != kv_size) {
         
     | 
| 
      
 15374 
     | 
    
         
            +
                        // the KV cache needs to be big enough to load all the KV cells from the saved state
         
     | 
| 
      
 15375 
     | 
    
         
            +
                        GGML_ASSERT(kv_self.size >= kv_head);
         
     | 
| 
      
 15376 
     | 
    
         
            +
             
     | 
| 
      
 15377 
     | 
    
         
            +
                        LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
         
     | 
| 
      
 15378 
     | 
    
         
            +
                            __func__, kv_head, kv_size, kv_self.size);
         
     | 
| 
      
 15379 
     | 
    
         
            +
                    }
         
     | 
| 
      
 15380 
     | 
    
         
            +
             
     | 
| 
       14035 
15381 
     | 
    
         
             
                    if (kv_buf_size) {
         
     | 
| 
       14036 
     | 
    
         
            -
                         
     | 
| 
      
 15382 
     | 
    
         
            +
                        const size_t pre_kv_buf_size = inp - src;
         
     | 
| 
      
 15383 
     | 
    
         
            +
             
     | 
| 
      
 15384 
     | 
    
         
            +
                        GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
         
     | 
| 
       14037 
15385 
     | 
    
         | 
| 
       14038 
15386 
     | 
    
         
             
                        for (int il = 0; il < (int) n_layer; ++il) {
         
     | 
| 
       14039 
15387 
     | 
    
         
             
                            const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
         
     | 
| 
         @@ -14053,23 +15401,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { 
     | 
|
| 
       14053 
15401 
     | 
    
         | 
| 
       14054 
15402 
     | 
    
         
             
                            // v is not contiguous, copy row by row
         
     | 
| 
       14055 
15403 
     | 
    
         
             
                            const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
         
     | 
| 
       14056 
     | 
    
         
            -
                            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type,  
     | 
| 
      
 15404 
     | 
    
         
            +
                            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
         
     | 
| 
       14057 
15405 
     | 
    
         | 
| 
       14058 
15406 
     | 
    
         
             
                            for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
         
     | 
| 
       14059 
15407 
     | 
    
         
             
                                ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
         
     | 
| 
       14060 
15408 
     | 
    
         
             
                                inp += v_row_size;
         
     | 
| 
       14061 
15409 
     | 
    
         
             
                            }
         
     | 
| 
       14062 
15410 
     | 
    
         
             
                        }
         
     | 
| 
      
 15411 
     | 
    
         
            +
                        GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
         
     | 
| 
       14063 
15412 
     | 
    
         
             
                    }
         
     | 
| 
       14064 
15413 
     | 
    
         | 
| 
       14065 
     | 
    
         
            -
                     
     | 
| 
      
 15414 
     | 
    
         
            +
                    llama_kv_cache_clear(ctx);
         
     | 
| 
       14066 
15415 
     | 
    
         | 
| 
       14067 
15416 
     | 
    
         
             
                    ctx->kv_self.head = kv_head;
         
     | 
| 
       14068 
     | 
    
         
            -
                    ctx->kv_self.size = kv_size;
         
     | 
| 
       14069 
15417 
     | 
    
         
             
                    ctx->kv_self.used = kv_used;
         
     | 
| 
       14070 
15418 
     | 
    
         | 
| 
       14071 
     | 
    
         
            -
                    ctx->kv_self.cells.resize(kv_size);
         
     | 
| 
       14072 
     | 
    
         
            -
             
     | 
| 
       14073 
15419 
     | 
    
         
             
                    for (uint32_t i = 0; i < kv_head; ++i) {
         
     | 
| 
       14074 
15420 
     | 
    
         
             
                        llama_pos pos;
         
     | 
| 
       14075 
15421 
     | 
    
         
             
                        size_t    seq_id_size;
         
     | 
| 
         @@ -14086,22 +15432,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { 
     | 
|
| 
       14086 
15432 
     | 
    
         
             
                            ctx->kv_self.cells[i].seq_id.insert(seq_id);
         
     | 
| 
       14087 
15433 
     | 
    
         
             
                        }
         
     | 
| 
       14088 
15434 
     | 
    
         
             
                    }
         
     | 
| 
       14089 
     | 
    
         
            -
             
     | 
| 
       14090 
     | 
    
         
            -
                    for (uint32_t i = kv_head; i < kv_size; ++i) {
         
     | 
| 
       14091 
     | 
    
         
            -
                        ctx->kv_self.cells[i].pos = -1;
         
     | 
| 
       14092 
     | 
    
         
            -
                        ctx->kv_self.cells[i].seq_id.clear();
         
     | 
| 
       14093 
     | 
    
         
            -
                    }
         
     | 
| 
       14094 
15435 
     | 
    
         
             
                }
         
     | 
| 
       14095 
15436 
     | 
    
         | 
| 
       14096 
15437 
     | 
    
         
             
                const size_t nread    = inp - src;
         
     | 
| 
       14097 
     | 
    
         
            -
                const size_t max_size =  
     | 
| 
      
 15438 
     | 
    
         
            +
                const size_t max_size = llama_state_get_size(ctx);
         
     | 
| 
       14098 
15439 
     | 
    
         | 
| 
       14099 
15440 
     | 
    
         
             
                GGML_ASSERT(nread <= max_size);
         
     | 
| 
       14100 
15441 
     | 
    
         | 
| 
       14101 
15442 
     | 
    
         
             
                return nread;
         
     | 
| 
       14102 
15443 
     | 
    
         
             
            }
         
     | 
| 
       14103 
15444 
     | 
    
         | 
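The hunk above finishes the rename of the whole-context state API (llama_get_state_size / llama_copy_state_data / llama_set_state_data become llama_state_get_size / llama_state_get_data / llama_state_set_data) and makes restore clear the KV cache instead of resizing it. A minimal round-trip sketch against the renamed API (not part of the diff; `ctx` is assumed to be an initialized llama_context):

// In-memory snapshot/restore with the renamed llama_state_* API (sketch).
#include "llama.h"
#include <vector>

std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_state_get_size(ctx)); // upper bound on the payload
    const size_t written = llama_state_get_data(ctx, buf.data());
    buf.resize(written);                                 // actual payload may be smaller
    return buf;
}

void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_state_set_data(ctx, buf.data()); // clears the KV cache, then reloads it
}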
-static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -14139,7 +15480,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     // restore the context state
     {
         const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
+        const size_t n_state_size_max = llama_state_get_size(ctx);
 
         if (n_state_size_cur > n_state_size_max) {
             LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -14149,22 +15490,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         std::vector<uint8_t> state_data(n_state_size_max);
         file.read_raw(state_data.data(), n_state_size_cur);
 
-        llama_set_state_data(ctx, state_data.data());
+        llama_state_set_data(ctx, state_data.data());
     }
 
     return true;
 }
 
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     try {
-        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
         return false;
     }
 }
 
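Usage of the renamed loader is unchanged apart from the name. A hedged sketch (the path and token capacity are illustrative, not from the diff):

// Restoring a session saved by llama_state_save_file (sketch).
std::vector<llama_token> tokens(4096);   // capacity assumption
size_t n_tokens = 0;
if (!llama_state_load_file(ctx, "session.bin", tokens.data(), tokens.size(), &n_tokens)) {
    // missing file, bad magic/version, or state larger than this context allows
}
tokens.resize(n_tokens);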
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");
 
     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -14178,11 +15519,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
     // save the context state using stream saving
     llama_data_file_context data_ctx(&file);
-    llama_copy_state_data_internal(ctx, &data_ctx);
+    llama_state_get_data_internal(ctx, &data_ctx);
 
     return true;
 }
 
+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
+        return false;
+    }
+}
+
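The old public llama_save_session_file body becomes a private _internal helper, and the new llama_state_save_file wraps it in a try/catch so file errors surface as a false return instead of an exception. A sketch of the save side (file name illustrative):

// Persisting the full context plus its prompt tokens (sketch).
if (!llama_state_save_file(ctx, "session.bin", tokens.data(), tokens.size())) {
    // I/O failure inside llama_file was caught and logged by the wrapper
}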
+size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
+    // save the size of size_t as a uint32_t for safety check
+    const size_t size_t_size_size = sizeof(uint32_t);
+
+    // other values
+    const size_t s_cell_count_size = sizeof(uint32_t);
+    const size_t s_layer_count_size = sizeof(uint32_t);
+    const size_t n_embd_v_gqa_size = sizeof(uint32_t);
+
+    size_t s_cell_count = 0;
+    size_t s_cell_data_size = 0;
+    const auto & kv_self = ctx->kv_self;
+    const auto & hparams = ctx->model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+    for (uint32_t i = 0; i < kv_self.size; ++i) {
+        const auto & cell = kv_self.cells[i];
+        if (cell.seq_id.count(seq_id) > 0) {
+            ++s_cell_count;
+            s_cell_data_size += sizeof(llama_pos);
+        }
+    }
+
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // types of keys and values
+        s_cell_data_size += sizeof(int32_t) * 2;
+        // k_size_row and v_size_el values of layer
+        s_cell_data_size += sizeof(size_t) * 2;
+
+        // keys
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        s_cell_data_size += k_size_row * s_cell_count;
+
+        // values (transposed)
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
+    }
+
+    const size_t s_total = (
+        size_t_size_size +
+        s_cell_count_size +
+        s_layer_count_size +
+        n_embd_v_gqa_size +
+        s_cell_data_size
+        );
+
+    return s_total;
+}
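llama_state_seq_get_size sums fixed-size headers (the size_t width marker, cell count, layer count, n_embd_v_gqa) with per-cell positions and per-layer key rows and value elements, so it is a safe allocation bound for the serializer that follows. A sketch of the intended pairing (seq id 0 is illustrative):

// Sizing a buffer for one sequence, then serializing into it (sketch).
std::vector<uint8_t> seq_buf(llama_state_seq_get_size(ctx, /*seq_id=*/0));
const size_t n_written = llama_state_seq_get_data(ctx, seq_buf.data(), /*seq_id=*/0);
seq_buf.resize(n_written);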
+
+static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
+    const auto & kv_self = ctx->kv_self;
+    GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+    // Save the size of size_t as a uint32_t for safety check
+    const uint32_t size_t_size = sizeof(size_t);
+    data_ctx.write(&size_t_size, sizeof(size_t_size));
+
+    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+    uint32_t cell_count = 0;
+
+    // Count the number of cells with the specified seq_id
+    // Find all the ranges of cells with this seq id
+    {
+        uint32_t cell_range_begin = kv_self.size;
+        for (uint32_t i = 0; i < kv_self.size; ++i) {
+            const auto & cell = kv_self.cells[i];
+            if (cell.has_seq_id(seq_id)) {
+                ++cell_count;
+                if (cell_range_begin == kv_self.size) {
+                    cell_range_begin = i;
+                }
+            }
+            else {
+                if (cell_range_begin != kv_self.size) {
+                    cell_ranges.push_back({ cell_range_begin, i });
+                    cell_range_begin = kv_self.size;
+                }
+            }
+        }
+        if (cell_range_begin != kv_self.size) {
+            cell_ranges.push_back({ cell_range_begin, kv_self.size });
+        }
+
+        // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+        uint32_t cell_count_check = 0;
+        for (const auto & range : cell_ranges) {
+            cell_count_check += range.second - range.first;
+        }
+        GGML_ASSERT(cell_count == cell_count_check);
+    }
+
+    // Write the cell count
+    data_ctx.write(&cell_count, sizeof(cell_count));
+
+    const auto & hparams = ctx->model.hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+
+    // Write the layer count
+    data_ctx.write(&n_layer, sizeof(n_layer));
+
+    // Write n_embd_v_gqa
+    data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+    // Iterate the ranges and write all the pos (this is the token position in the prompt)
+    for (const auto & range : cell_ranges) {
+        for (uint32_t i = range.first; i < range.second; ++i) {
+            const auto & cell = kv_self.cells[i];
+            data_ctx.write(&cell.pos, sizeof(cell.pos));
+        }
+    }
+
+    // Iterate and write all the keys first, each row is a cell
+    // Get whole range at a time
+    std::vector<uint8_t> tmp_buf;
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Write key type
+        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+        data_ctx.write(&k_type_i, sizeof(k_type_i));
+
+        // Write row size of key
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        data_ctx.write(&k_size_row, sizeof(k_size_row));
+
+        // Read each range of cells of k_size length each into tmp_buf and write out
+        for (const auto & range : cell_ranges) {
+            const size_t range_size = range.second - range.first;
+            tmp_buf.resize(range_size * k_size_row);
+            ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
+            data_ctx.write(tmp_buf.data(), tmp_buf.size());
+        }
+    }
+
+    // For the values, they are transposed, so we also need the element size and get the element ranges from each row
+    const uint32_t kv_size = kv_self.size;
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Write value type
+        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+        data_ctx.write(&v_type_i, sizeof(v_type_i));
+
+        // Write element size
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        data_ctx.write(&v_size_el, sizeof(v_size_el));
+
+        // For each row, we get the element values of each cell
+        for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+            // Read each range of cells of v_size_el length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                tmp_buf.resize(range_size * v_size_el);
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                data_ctx.write(tmp_buf.data(), tmp_buf.size());
+            }
+        }
+    }
+
+    return data_ctx.get_size_written();
+}
+
+size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
+    llama_data_buffer_context data_ctx(dst);
+    return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+}
+
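Keys are stored row-per-cell, but the V cache is transposed: row j of the value matrix holds one element per cell, which is where the (range.first + j * kv_size) * v_size_el offset above comes from. The same arithmetic as a tiny standalone helper (a sketch of the layout, not a library function):

// Byte offset of (cell, row) in a transposed per-layer V tensor (sketch).
size_t v_offset(uint32_t cell, uint32_t row, uint32_t kv_size, size_t v_size_el) {
    return (cell + (size_t) row * kv_size) * v_size_el;
}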
+size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+    auto & kv_self = ctx->kv_self;
+    GGML_ASSERT(!kv_self.recurrent); // not implemented
+
+    // Wipe the slot
+    llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+    const uint8_t * inp = src;
+
+    // Read size of size_t
+    uint32_t size_t_size;
+    memcpy(&size_t_size, inp, sizeof(size_t_size));
+    inp += sizeof(size_t_size);
+    if (size_t_size != sizeof(size_t)) {
+        LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+        return 0;
+    }
+
+    // Read the cell count
+    uint32_t cell_count;
+    memcpy(&cell_count, inp, sizeof(cell_count));
+    inp += sizeof(cell_count);
+
+    // Read the layer count
+    uint32_t n_layer_ref;
+    memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
+    inp += sizeof(n_layer_ref);
+
+    // Read n_embd_v_gqa
+    uint32_t n_embd_v_gqa_ref;
+    memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
+    inp += sizeof(n_embd_v_gqa_ref);
+
+    // Sanity check model compatibility
+    const auto & hparams = ctx->model.hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
+    if (n_layer != n_layer_ref) {
+        LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
+        return 0;
+    }
+    if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+        LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
+        return 0;
+    }
+
+    // Allocate the new cells for the slot
+    if (cell_count) {
+        llama_batch batch = llama_batch_init(cell_count, 0, 1);
+        batch.n_tokens = cell_count;
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_pos pos;
+            memcpy(&pos, inp, sizeof(pos));
+            inp += sizeof(pos);
+
+            batch.pos[i] = pos;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id[i][0] = dest_seq_id;
+        }
+        if (!llama_kv_cache_find_slot(kv_self, batch)) {
+            llama_batch_free(batch);
+            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            return 0;
+        }
+
+        // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+        // Assume that this is one contiguous block of cells
+        GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+        GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+
+        // Cleanup
+        llama_batch_free(batch);
+    }
+
+    const uint32_t kv_size = kv_self.size;
+    const uint32_t kv_head = kv_self.head;
+
+    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Read type of key
+        int32_t k_type_i_ref;
+        memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
+        inp += sizeof(k_type_i_ref);
+        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+        if (k_type_i != k_type_i_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+            return 0;
+        }
+
+        // Read row size of key
+        size_t k_size_row_ref;
+        memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
+        inp += sizeof(k_size_row_ref);
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        if (k_size_row != k_size_row_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
+            return 0;
+        }
+
+        if (cell_count) {
+            // Read and set the keys for the whole cell range
+            ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
+            inp += cell_count * k_size_row;
+        }
+    }
+
+    // For each layer, read the values for each cell (transposed)
+    for (int il = 0; il < (int)n_layer; ++il) {
+        // Read type of value
+        int32_t v_type_i_ref;
+        memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
+        inp += sizeof(v_type_i_ref);
+        const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+        if (v_type_i != v_type_i_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+            return 0;
+        }
+
+        // Read element size of value
+        size_t v_size_el_ref;
+        memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
+        inp += sizeof(v_size_el_ref);
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        if (v_size_el != v_size_el_ref) {
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
+            return 0;
+        }
+
+        if (cell_count) {
+            // For each row in the transposed matrix, read the values for the whole cell range
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
+                inp += cell_count * v_size_el;
+            }
+        }
+    }
+
+    const size_t nread = inp - src;
+    return nread;
+}
+
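Because llama_state_seq_set_data validates only the layer count, n_embd_v_gqa, KV types, and sizes, a serialized sequence can be moved between contexts built from the same model. A copy sketch (ctx_src, ctx_dst, and the seq ids are illustrative):

// Copying sequence 0 of ctx_src into sequence 1 of ctx_dst (sketch).
std::vector<uint8_t> buf(llama_state_seq_get_size(ctx_src, 0));
llama_state_seq_get_data(ctx_src, buf.data(), 0);
if (llama_state_seq_set_data(ctx_dst, buf.data(), /*dest_seq_id=*/1) == 0) {
    // layout mismatch or no free KV slot; the destination seq was wiped
}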
+static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(filepath, "wb");
+
+    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
+    file.write_u32(LLAMA_STATE_SEQ_VERSION);
+
+    // save the prompt
+    file.write_u32((uint32_t)n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state using stream saving
+    llama_data_file_context data_ctx(&file);
+    llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+
+    const size_t res = file.tell();
+    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
+    return res;
+}
+
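The resulting file layout is simple, and the final GGML_ASSERT checks exactly this accounting (three uint32 fields, the token array, then the state stream):

// Sequence state file layout as written above (schematic comment):
//   uint32      magic          = LLAMA_STATE_SEQ_MAGIC
//   uint32      version        = LLAMA_STATE_SEQ_VERSION
//   uint32      n_token_count
//   llama_token tokens[n_token_count]
//   uint8_t     state[]        // llama_state_seq_get_data_internal stream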
+static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(filepath, "rb");
+
+    // version checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
+            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
+            return 0;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return 0;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t state_size = file.size - file.tell();
+        std::vector<uint8_t> state_data(state_size);
+        file.read_raw(state_data.data(), state_size);
+        const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+        if (!nread) {
+            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
+            return 0;
+        }
+        GGML_ASSERT(nread <= state_size);
+        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
+    }
+
+    return file.tell();
+}
+
+size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+        return 0;
+    }
+}
+
+size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+        return 0;
+    }
+}
+
| 
       14186 
15936 
     | 
    
         
             
            void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
         
     | 
| 
       14187 
15937 
     | 
    
         
             
                ctx->cparams.n_threads       = n_threads;
         
     | 
| 
       14188 
15938 
     | 
    
         
             
                ctx->cparams.n_threads_batch = n_threads_batch;
         
     | 
| 
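The block above completes the new per-sequence session API: `llama_state_seq_save_file` / `llama_state_seq_load_file` persist and restore the KV-cache state of a single `llama_seq_id`, mirroring the whole-context `llama_state_save_file` / `llama_state_load_file` pair. A minimal usage sketch, assuming `ctx` is an initialized context and `prompt` holds the tokens already decoded on sequence 0 (the helper, file name, and error handling are illustrative, not part of the diff):

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    // Save sequence 0's KV-cache state together with its prompt tokens, then
    // restore it into sequence 1 of the same context. Illustrative helper.
    static bool roundtrip_seq_state(llama_context * ctx, const std::vector<llama_token> & prompt) {
        if (llama_state_seq_save_file(ctx, "seq0.llstate", 0, prompt.data(), prompt.size()) == 0) {
            fprintf(stderr, "failed to save sequence state\n");
            return false;
        }

        std::vector<llama_token> tokens(prompt.size());  // capacity for the stored prompt
        size_t n_tokens = 0;
        if (llama_state_seq_load_file(ctx, "seq0.llstate", 1, tokens.data(), tokens.size(), &n_tokens) == 0) {
            fprintf(stderr, "failed to restore sequence state\n");
            return false;
        }
        return true;
    }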
@@ -14296,11 +16046,41 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
-    assert(ctx->logits != nullptr);
-
+    int32_t j = -1;
     llama_synchronize(ctx);
 
-    return ctx->logits + i*ctx->model.hparams.n_vocab;
+    try {
+        if (ctx->logits == nullptr) {
+            throw std::runtime_error("no logits");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->logits + j*ctx->model.hparams.n_vocab;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ASSERT(false);
+#endif
+        return nullptr;
+    }
 }
 
 float * llama_get_embeddings(struct llama_context * ctx) {
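`llama_get_logits_ith` previously asserted and indexed the logits buffer directly; it now resolves the position through `ctx->output_ids`, accepts negative indices counted back from the last output, and logs and returns `nullptr` on a bad index instead of reading out of bounds (hard-asserting only in debug builds). A sketch of the resulting calling pattern, assuming `ctx` has just been through `llama_decode` (the helper is illustrative):

    #include <cstdio>
    #include "llama.h"

    // Logits of the last output row after llama_decode(); -1 now means "last
    // output", so the caller no longer tracks batch.n_tokens - 1 itself.
    static const float * last_logits(llama_context * ctx) {
        const float * logits = llama_get_logits_ith(ctx, -1);
        if (logits == nullptr) {
            // out of range, or logits were not requested for that position;
            // llama.cpp has already logged the reason
            fprintf(stderr, "no logits available\n");
        }
        return logits;
    }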
@@ -14310,9 +16090,42 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);
 
-    return ctx->embd + i*ctx->model.hparams.n_embd;
+    try {
+        if (ctx->embd == nullptr) {
+            throw std::runtime_error("no embeddings");
+        }
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
+            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
+        }
+
+        if (j < 0) {
+            throw std::runtime_error(format("batch.logits[%d] != true", i));
+        }
+        if (j >= ctx->n_outputs) {
+            // This should not happen
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
+        }
+
+        return ctx->embd + j*ctx->model.hparams.n_embd;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
+#ifndef NDEBUG
+        GGML_ASSERT(false);
+#endif
+        return nullptr;
+    }
 }
 
 float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
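The embeddings accessor above receives the identical hardening, so the same indexing contract applies; a sketch assuming an embeddings-enabled `ctx` (helper name illustrative):

    #include "llama.h"

    // Embedding of the last output row, or nullptr on an invalid index;
    // mirrors the hardened logits accessor above.
    static const float * last_embedding(llama_context * ctx) {
        return llama_get_embeddings_ith(ctx, -1);
    }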
@@ -14349,6 +16162,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return model->vocab.special_eos_id;
 }
 
+llama_token llama_token_cls(const struct llama_model * model) {
+    return model->vocab.special_cls_id;
+}
+
+llama_token llama_token_sep(const struct llama_model * model) {
+    return model->vocab.special_sep_id;
+}
+
 llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
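`llama_token_cls` and `llama_token_sep` expose the vocabulary's classifier and separator tokens, the tokens BERT-style encoder inputs are framed with. A sketch of that framing, assuming `model` is loaded and `body` is already tokenized (the helper is illustrative):

    #include <vector>
    #include "llama.h"

    // Frame tokenized text for a BERT-style encoder: [CLS] body [SEP].
    static std::vector<llama_token> frame_for_classification(const llama_model * model,
                                                             const std::vector<llama_token> & body) {
        std::vector<llama_token> input;
        input.reserve(body.size() + 2);
        input.push_back(llama_token_cls(model));              // [CLS]
        input.insert(input.end(), body.begin(), body.end());
        input.push_back(llama_token_sep(model));              // [SEP]
        return input;
    }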
@@ -14383,9 +16204,9 @@ int32_t llama_tokenize(
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
-                       bool   add_bos,
-                       bool   special) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
+                       bool   add_special,
+                       bool   parse_special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
 
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
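The last two `llama_tokenize` parameters are renamed: `add_bos` becomes `add_special` (the model decides which special tokens to add, not just BOS), and `special` becomes `parse_special` (match special tokens written literally in the input text). A usage sketch, assuming `model` is loaded; the retry-on-negative-return convention is unchanged:

    #include <cstring>
    #include <vector>
    #include "llama.h"

    // Tokenize with the renamed flags; on overflow the return value is the
    // negated required token count, so resize and retry. Illustrative helper.
    static std::vector<llama_token> tokenize(const llama_model * model, const char * text) {
        std::vector<llama_token> tokens(64);
        int32_t n = llama_tokenize(model, text, (int32_t) strlen(text),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_special=*/true, /*parse_special=*/true);
        if (n < 0) {
            tokens.resize(-n);
            n = llama_tokenize(model, text, (int32_t) strlen(text),
                               tokens.data(), (int32_t) tokens.size(), true, true);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }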
@@ -14602,6 +16423,55 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
+    } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+        // openchat/openchat-3.5-0106,
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "<|end_of_turn|>";
+            } else {
+                role[0] = toupper(role[0]);
+                ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
+            }
+        }
+        if (add_ass) {
+            ss << "GPT4 Correct Assistant:";
+        }
+    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+        // eachadea/vicuna-13b-1.1 (and Orca variant)
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // Orca-Vicuna variant uses a system prefix
+                if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+                    ss << "SYSTEM: " << message->content << "\n";
+                } else {
+                    ss << message->content << "\n\n";
+                }
+            } else if (role == "user") {
+                ss << "USER: " << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "ASSISTANT: " << message->content << "</s>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "ASSISTANT:";
+        }
+    } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+        // deepseek-ai/deepseek-coder-33b-instruct
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content;
+            } else if (role == "user") {
+                ss << "### Instruction:\n" << message->content << "\n";
+            } else if (role == "assistant") {
+                ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "### Response:\n";
+        }
     } else {
         // template not supported
         return -1;
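Three more curated chat templates are recognized by name or by signature substrings: `openchat`, `vicuna` / `vicuna-orca`, and `deepseek`. A sketch using the public `llama_chat_apply_template` entry point; as read here, the model pointer is only consulted when `tmpl` is null, so passing an explicit name with a null model is assumed to be fine:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Render a two-message conversation with the newly recognized "vicuna"
    // template. If the result does not fit, the return value is the required
    // length, so resize and retry. Illustrative helper.
    static std::string render_vicuna() {
        const llama_chat_message msgs[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(256);
        int32_t n = llama_chat_apply_template(nullptr, "vicuna", msgs, 2,
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) {
            buf.resize(n);
            n = llama_chat_apply_template(nullptr, "vicuna", msgs, 2, true, buf.data(), n);
        }
        return std::string(buf.data(), n > 0 ? (size_t) n : 0);
    }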
@@ -14651,6 +16521,30 @@ LLAMA_API int32_t llama_chat_apply_template(
     return res;
 }
 
+LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
+    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
+        return strlen(split_path);
+    }
+    return 0;
+}
+
+int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+    std::string str_split_path(split_path);
+    char postfix[32];
+    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
+    std::string str_postfix(postfix);
+
+    // check if dest ends with postfix
+    int size_prefix = str_split_path.size() - str_postfix.size();
+    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
+        snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        return size_prefix;
+    }
+
+    return 0;
+}
+
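`llama_split_path` and `llama_split_prefix` are the new helpers for sharded (split) GGUF models: the first formats the canonical `-%05d-of-%05d.gguf` shard name from a prefix, the second recovers the prefix from a shard path. A sketch with illustrative paths (`split_no` is zero-based in both):

    #include <cstdio>
    #include "llama.h"

    // Shard-name round trip for split GGUF models; paths are illustrative.
    static void split_path_demo() {
        char split_path[512] = { 0 };
        // split_no is zero-based: this yields "models/grok-1-00002-of-00008.gguf"
        llama_split_path(split_path, sizeof(split_path), "models/grok-1", 1, 8);

        char prefix[512] = { 0 };
        // recovers "models/grok-1" when the path carries the matching postfix
        if (llama_split_prefix(prefix, sizeof(prefix), split_path, 1, 8) > 0) {
            printf("prefix: %s\n", prefix);
        }
    }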
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,