llama_cpp 0.14.4 → 0.14.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +29 -9
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +142 -49
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +156 -156
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +942 -267
- data/vendor/tmp/llama.cpp/ggml.c +161 -95
- data/vendor/tmp/llama.cpp/ggml.h +12 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1577 -274
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
@@ -105,7 +105,7 @@
|
|
105
105
|
#endif
|
106
106
|
|
107
107
|
#define LLAMA_MAX_NODES 8192
|
108
|
-
#define LLAMA_MAX_EXPERTS
|
108
|
+
#define LLAMA_MAX_EXPERTS 60
|
109
109
|
|
110
110
|
|
111
111
|
//
|
@@ -209,6 +209,7 @@ enum llm_arch {
|
|
209
209
|
LLM_ARCH_STABLELM,
|
210
210
|
LLM_ARCH_QWEN,
|
211
211
|
LLM_ARCH_QWEN2,
|
212
|
+
LLM_ARCH_QWEN2MOE,
|
212
213
|
LLM_ARCH_PHI2,
|
213
214
|
LLM_ARCH_PLAMO,
|
214
215
|
LLM_ARCH_CODESHELL,
|
@@ -220,6 +221,8 @@ enum llm_arch {
|
|
220
221
|
LLM_ARCH_MAMBA,
|
221
222
|
LLM_ARCH_XVERSE,
|
222
223
|
LLM_ARCH_COMMAND_R,
|
224
|
+
LLM_ARCH_DBRX,
|
225
|
+
LLM_ARCH_OLMO,
|
223
226
|
LLM_ARCH_UNKNOWN,
|
224
227
|
};
|
225
228
|
|
@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
241
244
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
242
245
|
{ LLM_ARCH_QWEN, "qwen" },
|
243
246
|
{ LLM_ARCH_QWEN2, "qwen2" },
|
247
|
+
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
244
248
|
{ LLM_ARCH_PHI2, "phi2" },
|
245
249
|
{ LLM_ARCH_PLAMO, "plamo" },
|
246
250
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
252
256
|
{ LLM_ARCH_MAMBA, "mamba" },
|
253
257
|
{ LLM_ARCH_XVERSE, "xverse" },
|
254
258
|
{ LLM_ARCH_COMMAND_R, "command-r" },
|
259
|
+
{ LLM_ARCH_DBRX, "dbrx" },
|
260
|
+
{ LLM_ARCH_OLMO, "olmo" },
|
255
261
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
256
262
|
};
|
257
263
|
|
@@ -261,6 +267,7 @@ enum llm_kv {
|
|
261
267
|
LLM_KV_GENERAL_ALIGNMENT,
|
262
268
|
LLM_KV_GENERAL_NAME,
|
263
269
|
LLM_KV_GENERAL_AUTHOR,
|
270
|
+
LLM_KV_GENERAL_VERSION,
|
264
271
|
LLM_KV_GENERAL_URL,
|
265
272
|
LLM_KV_GENERAL_DESCRIPTION,
|
266
273
|
LLM_KV_GENERAL_LICENSE,
|
@@ -317,11 +324,17 @@ enum llm_kv {
|
|
317
324
|
LLM_KV_TOKENIZER_UNK_ID,
|
318
325
|
LLM_KV_TOKENIZER_SEP_ID,
|
319
326
|
LLM_KV_TOKENIZER_PAD_ID,
|
327
|
+
LLM_KV_TOKENIZER_CLS_ID,
|
328
|
+
LLM_KV_TOKENIZER_MASK_ID,
|
320
329
|
LLM_KV_TOKENIZER_ADD_BOS,
|
321
330
|
LLM_KV_TOKENIZER_ADD_EOS,
|
322
331
|
LLM_KV_TOKENIZER_ADD_PREFIX,
|
323
332
|
LLM_KV_TOKENIZER_HF_JSON,
|
324
333
|
LLM_KV_TOKENIZER_RWKV,
|
334
|
+
LLM_KV_TOKENIZER_PREFIX_ID,
|
335
|
+
LLM_KV_TOKENIZER_SUFFIX_ID,
|
336
|
+
LLM_KV_TOKENIZER_MIDDLE_ID,
|
337
|
+
LLM_KV_TOKENIZER_EOT_ID,
|
325
338
|
};
|
326
339
|
|
327
340
|
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
@@ -330,6 +343,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
330
343
|
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
331
344
|
{ LLM_KV_GENERAL_NAME, "general.name" },
|
332
345
|
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
346
|
+
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
333
347
|
{ LLM_KV_GENERAL_URL, "general.url" },
|
334
348
|
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
335
349
|
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
@@ -386,11 +400,17 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
386
400
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
387
401
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
388
402
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
403
|
+
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
404
|
+
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
389
405
|
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
390
406
|
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
391
407
|
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
392
408
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
393
409
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
410
|
+
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
411
|
+
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
412
|
+
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
413
|
+
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
394
414
|
};
|
395
415
|
|
396
416
|
struct LLM_KV {
|
@@ -421,6 +441,7 @@ enum llm_tensor {
|
|
421
441
|
LLM_TENSOR_ATTN_OUT_NORM,
|
422
442
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
423
443
|
LLM_TENSOR_FFN_GATE_INP,
|
444
|
+
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
424
445
|
LLM_TENSOR_FFN_NORM,
|
425
446
|
LLM_TENSOR_FFN_GATE,
|
426
447
|
LLM_TENSOR_FFN_DOWN,
|
@@ -432,6 +453,9 @@ enum llm_tensor {
|
|
432
453
|
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
433
454
|
LLM_TENSOR_FFN_GATE_EXPS,
|
434
455
|
LLM_TENSOR_FFN_UP_EXPS,
|
456
|
+
LLM_TENSOR_FFN_DOWN_SHEXP,
|
457
|
+
LLM_TENSOR_FFN_GATE_SHEXP,
|
458
|
+
LLM_TENSOR_FFN_UP_SHEXP,
|
435
459
|
LLM_TENSOR_ATTN_Q_NORM,
|
436
460
|
LLM_TENSOR_ATTN_K_NORM,
|
437
461
|
LLM_TENSOR_LAYER_OUT_NORM,
|
@@ -694,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
694
718
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
695
719
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
696
720
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
721
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
722
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
697
723
|
},
|
698
724
|
},
|
699
725
|
{
|
@@ -729,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
729
755
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
730
756
|
},
|
731
757
|
},
|
758
|
+
{
|
759
|
+
LLM_ARCH_QWEN2MOE,
|
760
|
+
{
|
761
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
762
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
763
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
764
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
765
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
766
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
767
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
768
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
769
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
770
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
771
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
772
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
773
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
774
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
775
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
776
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
777
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
778
|
+
},
|
779
|
+
},
|
732
780
|
{
|
733
781
|
LLM_ARCH_PHI2,
|
734
782
|
{
|
@@ -924,6 +972,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
924
972
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
925
973
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
926
974
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
975
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
976
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
977
|
+
},
|
978
|
+
},
|
979
|
+
{
|
980
|
+
LLM_ARCH_DBRX,
|
981
|
+
{
|
982
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
983
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
984
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
985
|
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
986
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
987
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
988
|
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
989
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
990
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
991
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
992
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
993
|
+
},
|
994
|
+
},
|
995
|
+
{
|
996
|
+
LLM_ARCH_OLMO,
|
997
|
+
{
|
998
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
999
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1000
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1001
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
1002
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
1003
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1004
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1005
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1006
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
927
1007
|
},
|
928
1008
|
},
|
929
1009
|
{
|
@@ -1630,17 +1710,17 @@ static size_t llama_get_device_memory(int device) {
|
|
1630
1710
|
#if defined(GGML_USE_CUDA)
|
1631
1711
|
size_t total;
|
1632
1712
|
size_t free;
|
1633
|
-
ggml_backend_cuda_get_device_memory(device, &
|
1713
|
+
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
1634
1714
|
return free;
|
1635
1715
|
#elif defined(GGML_USE_SYCL)
|
1636
1716
|
size_t total;
|
1637
1717
|
size_t free;
|
1638
|
-
ggml_backend_sycl_get_device_memory(device, &
|
1718
|
+
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
1639
1719
|
return free;
|
1640
1720
|
#elif defined(GGML_USE_VULKAN)
|
1641
1721
|
size_t total;
|
1642
1722
|
size_t free;
|
1643
|
-
ggml_backend_vk_get_device_memory(device, &
|
1723
|
+
ggml_backend_vk_get_device_memory(device, &free, &total);
|
1644
1724
|
return free;
|
1645
1725
|
#else
|
1646
1726
|
return 1;
|
@@ -1682,6 +1762,7 @@ enum e_model {
|
|
1682
1762
|
MODEL_4B,
|
1683
1763
|
MODEL_7B,
|
1684
1764
|
MODEL_8B,
|
1765
|
+
MODEL_12B,
|
1685
1766
|
MODEL_13B,
|
1686
1767
|
MODEL_14B,
|
1687
1768
|
MODEL_15B,
|
@@ -1697,6 +1778,10 @@ enum e_model {
|
|
1697
1778
|
MODEL_MEDIUM,
|
1698
1779
|
MODEL_LARGE,
|
1699
1780
|
MODEL_XL,
|
1781
|
+
MODEL_A2_7B,
|
1782
|
+
MODEL_8x7B,
|
1783
|
+
MODEL_8x22B,
|
1784
|
+
MODEL_16x12B,
|
1700
1785
|
};
|
1701
1786
|
|
1702
1787
|
static const size_t kiB = 1024;
|
@@ -1880,6 +1965,12 @@ struct llama_layer {
|
|
1880
1965
|
struct ggml_tensor * ffn_down_exps;
|
1881
1966
|
struct ggml_tensor * ffn_up_exps ;
|
1882
1967
|
|
1968
|
+
// ff shared expert (shexp)
|
1969
|
+
struct ggml_tensor * ffn_gate_inp_shexp;
|
1970
|
+
struct ggml_tensor * ffn_gate_shexp;
|
1971
|
+
struct ggml_tensor * ffn_down_shexp;
|
1972
|
+
struct ggml_tensor * ffn_up_shexp;
|
1973
|
+
|
1883
1974
|
// ff bias
|
1884
1975
|
struct ggml_tensor * ffn_down_b; // b2
|
1885
1976
|
struct ggml_tensor * ffn_up_b; // b3
|
@@ -2014,20 +2105,22 @@ struct llama_vocab {
|
|
2014
2105
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
2015
2106
|
|
2016
2107
|
// default LLaMA special tokens
|
2017
|
-
id special_bos_id
|
2018
|
-
id special_eos_id
|
2019
|
-
id special_unk_id
|
2020
|
-
id special_sep_id
|
2021
|
-
id special_pad_id
|
2108
|
+
id special_bos_id = 1;
|
2109
|
+
id special_eos_id = 2;
|
2110
|
+
id special_unk_id = 0;
|
2111
|
+
id special_sep_id = -1;
|
2112
|
+
id special_pad_id = -1;
|
2113
|
+
id special_cls_id = -1;
|
2114
|
+
id special_mask_id = -1;
|
2022
2115
|
|
2023
2116
|
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
2024
2117
|
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
2025
2118
|
|
2026
2119
|
id linefeed_id = 13;
|
2027
|
-
id special_prefix_id =
|
2028
|
-
id
|
2029
|
-
id
|
2030
|
-
id special_eot_id =
|
2120
|
+
id special_prefix_id = -1;
|
2121
|
+
id special_suffix_id = -1;
|
2122
|
+
id special_middle_id = -1;
|
2123
|
+
id special_eot_id = -1;
|
2031
2124
|
|
2032
2125
|
bool add_space_prefix = true;
|
2033
2126
|
|
@@ -2175,7 +2268,7 @@ struct llama_context {
|
|
2175
2268
|
|
2176
2269
|
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
2177
2270
|
size_t output_size = 0; // capacity (of tokens positions) for the output buffers
|
2178
|
-
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
|
2271
|
+
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
2179
2272
|
|
2180
2273
|
bool logits_all = false;
|
2181
2274
|
|
@@ -3533,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
3533
3626
|
case MODEL_3B: return "3B";
|
3534
3627
|
case MODEL_7B: return "7B";
|
3535
3628
|
case MODEL_8B: return "8B";
|
3629
|
+
case MODEL_12B: return "12B";
|
3536
3630
|
case MODEL_13B: return "13B";
|
3537
3631
|
case MODEL_14B: return "14B";
|
3538
3632
|
case MODEL_15B: return "15B";
|
@@ -3548,6 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
|
|
3548
3642
|
case MODEL_MEDIUM: return "0.4B";
|
3549
3643
|
case MODEL_LARGE: return "0.8B";
|
3550
3644
|
case MODEL_XL: return "1.5B";
|
3645
|
+
case MODEL_A2_7B: return "A2.7B";
|
3646
|
+
case MODEL_8x7B: return "8x7B";
|
3647
|
+
case MODEL_8x22B: return "8x22B";
|
3648
|
+
case MODEL_16x12B: return "16x12B";
|
3551
3649
|
default: return "?B";
|
3552
3650
|
}
|
3553
3651
|
}
|
@@ -3662,15 +3760,23 @@ static void llm_load_hparams(
|
|
3662
3760
|
{
|
3663
3761
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3664
3762
|
|
3665
|
-
|
3666
|
-
|
3667
|
-
|
3668
|
-
|
3669
|
-
|
3670
|
-
|
3671
|
-
|
3672
|
-
|
3673
|
-
|
3763
|
+
if (hparams.n_expert == 8) {
|
3764
|
+
switch (hparams.n_layer) {
|
3765
|
+
case 32: model.type = e_model::MODEL_8x7B; break;
|
3766
|
+
case 56: model.type = e_model::MODEL_8x22B; break;
|
3767
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3768
|
+
}
|
3769
|
+
} else {
|
3770
|
+
switch (hparams.n_layer) {
|
3771
|
+
case 22: model.type = e_model::MODEL_1B; break;
|
3772
|
+
case 26: model.type = e_model::MODEL_3B; break;
|
3773
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
3774
|
+
case 40: model.type = e_model::MODEL_13B; break;
|
3775
|
+
case 48: model.type = e_model::MODEL_34B; break;
|
3776
|
+
case 60: model.type = e_model::MODEL_30B; break;
|
3777
|
+
case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
|
3778
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3779
|
+
}
|
3674
3780
|
}
|
3675
3781
|
} break;
|
3676
3782
|
case LLM_ARCH_MINICPM:
|
@@ -3812,6 +3918,7 @@ static void llm_load_hparams(
|
|
3812
3918
|
switch (hparams.n_layer) {
|
3813
3919
|
case 24: model.type = e_model::MODEL_1B; break;
|
3814
3920
|
case 32: model.type = e_model::MODEL_3B; break;
|
3921
|
+
case 40: model.type = e_model::MODEL_12B; break;
|
3815
3922
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3816
3923
|
}
|
3817
3924
|
} break;
|
@@ -3836,6 +3943,14 @@ static void llm_load_hparams(
|
|
3836
3943
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3837
3944
|
}
|
3838
3945
|
} break;
|
3946
|
+
case LLM_ARCH_QWEN2MOE:
|
3947
|
+
{
|
3948
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
3949
|
+
switch (hparams.n_layer) {
|
3950
|
+
case 24: model.type = e_model::MODEL_A2_7B; break;
|
3951
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
3952
|
+
}
|
3953
|
+
} break;
|
3839
3954
|
case LLM_ARCH_PHI2:
|
3840
3955
|
{
|
3841
3956
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
@@ -3961,6 +4076,28 @@ static void llm_load_hparams(
|
|
3961
4076
|
default: model.type = e_model::MODEL_UNKNOWN;
|
3962
4077
|
}
|
3963
4078
|
} break;
|
4079
|
+
case LLM_ARCH_DBRX:
|
4080
|
+
{
|
4081
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
4082
|
+
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
4083
|
+
|
4084
|
+
switch (hparams.n_layer) {
|
4085
|
+
case 40: model.type = e_model::MODEL_16x12B; break;
|
4086
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4087
|
+
}
|
4088
|
+
} break;
|
4089
|
+
case LLM_ARCH_OLMO:
|
4090
|
+
{
|
4091
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
4092
|
+
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
4093
|
+
|
4094
|
+
switch (hparams.n_layer) {
|
4095
|
+
case 22: model.type = e_model::MODEL_1B; break;
|
4096
|
+
case 32: model.type = e_model::MODEL_7B; break;
|
4097
|
+
case 80: model.type = e_model::MODEL_70B; break;
|
4098
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4099
|
+
}
|
4100
|
+
} break;
|
3964
4101
|
default: (void)0;
|
3965
4102
|
}
|
3966
4103
|
|
@@ -3974,7 +4111,9 @@ static void llm_load_hparams(
|
|
3974
4111
|
}
|
3975
4112
|
|
3976
4113
|
// TODO: This should probably be in llama.h
|
3977
|
-
static std::vector<llama_vocab::id> llama_tokenize_internal(
|
4114
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(
|
4115
|
+
const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
|
4116
|
+
);
|
3978
4117
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
3979
4118
|
|
3980
4119
|
static void llm_load_vocab(
|
@@ -3996,23 +4135,53 @@ static void llm_load_vocab(
|
|
3996
4135
|
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
3997
4136
|
|
3998
4137
|
// default special tokens
|
3999
|
-
vocab.special_bos_id
|
4000
|
-
vocab.special_eos_id
|
4001
|
-
vocab.special_unk_id
|
4002
|
-
vocab.special_sep_id
|
4003
|
-
vocab.special_pad_id
|
4004
|
-
vocab.
|
4138
|
+
vocab.special_bos_id = -1;
|
4139
|
+
vocab.special_eos_id = -1;
|
4140
|
+
vocab.special_unk_id = -1;
|
4141
|
+
vocab.special_sep_id = -1;
|
4142
|
+
vocab.special_pad_id = -1;
|
4143
|
+
vocab.special_cls_id = -1;
|
4144
|
+
vocab.special_mask_id = -1;
|
4145
|
+
vocab.linefeed_id = -1;
|
4005
4146
|
|
4006
4147
|
return;
|
4007
4148
|
} else if (tokenizer_name == "llama") {
|
4008
4149
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4009
4150
|
|
4010
4151
|
// default special tokens
|
4011
|
-
vocab.special_bos_id
|
4012
|
-
vocab.special_eos_id
|
4013
|
-
vocab.special_unk_id
|
4014
|
-
vocab.special_sep_id
|
4015
|
-
vocab.special_pad_id
|
4152
|
+
vocab.special_bos_id = 1;
|
4153
|
+
vocab.special_eos_id = 2;
|
4154
|
+
vocab.special_unk_id = 0;
|
4155
|
+
vocab.special_sep_id = -1;
|
4156
|
+
vocab.special_pad_id = -1;
|
4157
|
+
vocab.special_cls_id = -1;
|
4158
|
+
vocab.special_mask_id = -1;
|
4159
|
+
|
4160
|
+
// For Fill-In-the-Middle (FIM)/infill models which were converted
|
4161
|
+
// prior to support of FIM special tokens in GGUF, the following
|
4162
|
+
// will allow those models to continue to work. The general names
|
4163
|
+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
4164
|
+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
4165
|
+
// new versions of these models have been published.
|
4166
|
+
std::string gen_name;
|
4167
|
+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
4168
|
+
|
4169
|
+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
4170
|
+
[](unsigned char c){ return std::tolower(c); });
|
4171
|
+
|
4172
|
+
if (gen_name.find("code") != std::string::npos) {
|
4173
|
+
if (model.arch == LLM_ARCH_LLAMA) {
|
4174
|
+
vocab.special_prefix_id = 32007;
|
4175
|
+
vocab.special_suffix_id = 32008;
|
4176
|
+
vocab.special_middle_id = 32009;
|
4177
|
+
vocab.special_eot_id = 32010;
|
4178
|
+
} else if (model.arch == LLM_ARCH_GEMMA) {
|
4179
|
+
vocab.special_prefix_id = 67;
|
4180
|
+
vocab.special_suffix_id = 69;
|
4181
|
+
vocab.special_middle_id = 68;
|
4182
|
+
vocab.special_eot_id = 70;
|
4183
|
+
}
|
4184
|
+
}
|
4016
4185
|
|
4017
4186
|
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
4018
4187
|
if (add_space_prefix_keyidx != -1) {
|
@@ -4047,20 +4216,24 @@ static void llm_load_vocab(
|
|
4047
4216
|
}
|
4048
4217
|
|
4049
4218
|
// default special tokens
|
4050
|
-
vocab.special_bos_id
|
4051
|
-
vocab.special_eos_id
|
4052
|
-
vocab.special_unk_id
|
4053
|
-
vocab.special_sep_id
|
4054
|
-
vocab.special_pad_id
|
4219
|
+
vocab.special_bos_id = 11;
|
4220
|
+
vocab.special_eos_id = 11;
|
4221
|
+
vocab.special_unk_id = -1;
|
4222
|
+
vocab.special_sep_id = -1;
|
4223
|
+
vocab.special_pad_id = -1;
|
4224
|
+
vocab.special_cls_id = -1;
|
4225
|
+
vocab.special_mask_id = -1;
|
4055
4226
|
} else if (tokenizer_name == "bert") {
|
4056
4227
|
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
4057
4228
|
|
4058
4229
|
// default special tokens
|
4059
|
-
vocab.special_bos_id
|
4060
|
-
vocab.special_eos_id
|
4061
|
-
vocab.special_unk_id
|
4062
|
-
vocab.special_sep_id
|
4063
|
-
vocab.special_pad_id
|
4230
|
+
vocab.special_bos_id = -1;
|
4231
|
+
vocab.special_eos_id = -1;
|
4232
|
+
vocab.special_unk_id = 100;
|
4233
|
+
vocab.special_sep_id = 102;
|
4234
|
+
vocab.special_pad_id = 0;
|
4235
|
+
vocab.special_cls_id = 101;
|
4236
|
+
vocab.special_mask_id = 103;
|
4064
4237
|
vocab.add_space_prefix = false;
|
4065
4238
|
} else {
|
4066
4239
|
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
@@ -4123,11 +4296,17 @@ static void llm_load_vocab(
|
|
4123
4296
|
// special tokens
|
4124
4297
|
{
|
4125
4298
|
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
4126
|
-
{ LLM_KV_TOKENIZER_BOS_ID,
|
4127
|
-
{ LLM_KV_TOKENIZER_EOS_ID,
|
4128
|
-
{ LLM_KV_TOKENIZER_UNK_ID,
|
4129
|
-
{ LLM_KV_TOKENIZER_SEP_ID,
|
4130
|
-
{ LLM_KV_TOKENIZER_PAD_ID,
|
4299
|
+
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
4300
|
+
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
4301
|
+
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
4302
|
+
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
4303
|
+
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
4304
|
+
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
4305
|
+
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
4306
|
+
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
|
4307
|
+
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
|
4308
|
+
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
|
4309
|
+
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
4131
4310
|
};
|
4132
4311
|
for (const auto & it : special_token_types) {
|
4133
4312
|
const std::string & key = kv(std::get<0>(it));
|
@@ -4319,12 +4498,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4319
4498
|
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
4320
4499
|
|
4321
4500
|
// special tokens
|
4322
|
-
if (vocab.special_bos_id
|
4323
|
-
if (vocab.special_eos_id
|
4324
|
-
if (vocab.special_unk_id
|
4325
|
-
if (vocab.special_sep_id
|
4326
|
-
if (vocab.special_pad_id
|
4327
|
-
if (vocab.
|
4501
|
+
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
4502
|
+
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
4503
|
+
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
4504
|
+
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
4505
|
+
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
4506
|
+
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
4507
|
+
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
4508
|
+
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
4328
4509
|
}
|
4329
4510
|
|
4330
4511
|
// Returns false if cancelled by progress_callback
|
@@ -4342,6 +4523,13 @@ static bool llm_load_tensors(
|
|
4342
4523
|
|
4343
4524
|
auto & hparams = model.hparams;
|
4344
4525
|
|
4526
|
+
#ifdef GGML_USE_SYCL
|
4527
|
+
// disable MoE with SYCL until mul_mat_id is updated
|
4528
|
+
if (hparams.n_expert > 0) {
|
4529
|
+
n_gpu_layers = 0;
|
4530
|
+
}
|
4531
|
+
#endif
|
4532
|
+
|
4345
4533
|
model.split_mode = split_mode;
|
4346
4534
|
model.main_gpu = main_gpu;
|
4347
4535
|
model.n_gpu_layers = n_gpu_layers;
|
@@ -4439,7 +4627,7 @@ static bool llm_load_tensors(
|
|
4439
4627
|
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
4440
4628
|
|
4441
4629
|
// for moe merged tensors
|
4442
|
-
ctx_size += ggml_tensor_overhead()*
|
4630
|
+
ctx_size += ggml_tensor_overhead()*n_layer*3;
|
4443
4631
|
|
4444
4632
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
4445
4633
|
for (auto & it : buft_layer_count) {
|
@@ -4635,6 +4823,39 @@ static bool llm_load_tensors(
|
|
4635
4823
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
4636
4824
|
}
|
4637
4825
|
} break;
|
4826
|
+
case LLM_ARCH_DBRX:
|
4827
|
+
{
|
4828
|
+
if (n_expert == 0) {
|
4829
|
+
throw std::runtime_error("DBRX model cannot have zero experts");
|
4830
|
+
}
|
4831
|
+
|
4832
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
4833
|
+
|
4834
|
+
// output
|
4835
|
+
{
|
4836
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4837
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
4838
|
+
}
|
4839
|
+
|
4840
|
+
for (int i = 0; i < n_layer; ++i) {
|
4841
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
4842
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
4843
|
+
|
4844
|
+
auto & layer = model.layers[i];
|
4845
|
+
|
4846
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
4847
|
+
|
4848
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
4849
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
4850
|
+
|
4851
|
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
4852
|
+
|
4853
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
4854
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4855
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
|
4856
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
4857
|
+
}
|
4858
|
+
} break;
|
4638
4859
|
case LLM_ARCH_BAICHUAN:
|
4639
4860
|
{
|
4640
4861
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -4949,8 +5170,13 @@ static bool llm_load_tensors(
|
|
4949
5170
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
4950
5171
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
4951
5172
|
|
4952
|
-
|
4953
|
-
layer.
|
5173
|
+
// optional q and k layernorms, present in StableLM 2 12B
|
5174
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
|
5175
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
|
5176
|
+
|
5177
|
+
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5178
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
|
5179
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
4954
5180
|
|
4955
5181
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
4956
5182
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -4993,7 +5219,13 @@ static bool llm_load_tensors(
|
|
4993
5219
|
// output
|
4994
5220
|
{
|
4995
5221
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
4996
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5222
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
5223
|
+
// if output is NULL, init from the input tok embed
|
5224
|
+
if (model.output == NULL) {
|
5225
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5226
|
+
ml.n_created--; // artificial tensor
|
5227
|
+
ml.size_data += ggml_nbytes(model.output);
|
5228
|
+
}
|
4997
5229
|
}
|
4998
5230
|
|
4999
5231
|
for (int i = 0; i < n_layer; ++i) {
|
@@ -5021,6 +5253,54 @@ static bool llm_load_tensors(
|
|
5021
5253
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5022
5254
|
}
|
5023
5255
|
} break;
|
5256
|
+
case LLM_ARCH_QWEN2MOE:
|
5257
|
+
{
|
5258
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5259
|
+
|
5260
|
+
// output
|
5261
|
+
{
|
5262
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5263
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
5264
|
+
}
|
5265
|
+
|
5266
|
+
for (int i = 0; i < n_layer; ++i) {
|
5267
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
5268
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5269
|
+
|
5270
|
+
auto & layer = model.layers[i];
|
5271
|
+
|
5272
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5273
|
+
|
5274
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5275
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5276
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5277
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5278
|
+
|
5279
|
+
// optional bias tensors
|
5280
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
5281
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
5282
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
5283
|
+
|
5284
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
5285
|
+
|
5286
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
5287
|
+
|
5288
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
5289
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
5290
|
+
|
5291
|
+
// MoE branch
|
5292
|
+
auto n_ff_exp = n_ff / hparams.n_expert_used;
|
5293
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5294
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
5295
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
5296
|
+
|
5297
|
+
// Shared expert branch
|
5298
|
+
layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
|
5299
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
|
5300
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
|
5301
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
|
5302
|
+
}
|
5303
|
+
} break;
|
5024
5304
|
case LLM_ARCH_PHI2:
|
5025
5305
|
{
|
5026
5306
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
@@ -5404,11 +5684,47 @@ static bool llm_load_tensors(
|
|
5404
5684
|
|
5405
5685
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
5406
5686
|
|
5687
|
+
if (n_layer >= 64){
|
5688
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
|
5689
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
|
5690
|
+
}
|
5691
|
+
|
5692
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5693
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5694
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5695
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5696
|
+
|
5697
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5698
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5699
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5700
|
+
}
|
5701
|
+
} break;
|
5702
|
+
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
5703
|
+
{
|
5704
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5705
|
+
|
5706
|
+
// output
|
5707
|
+
{
|
5708
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
5709
|
+
// if output is NULL, init from the input tok embed
|
5710
|
+
if (model.output == NULL) {
|
5711
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
5712
|
+
ml.n_created--; // artificial tensor
|
5713
|
+
ml.size_data += ggml_nbytes(model.output);
|
5714
|
+
}
|
5715
|
+
}
|
5716
|
+
|
5717
|
+
for (int i = 0; i < n_layer; ++i) {
|
5718
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
5719
|
+
|
5720
|
+
auto & layer = model.layers[i];
|
5721
|
+
|
5407
5722
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
5408
5723
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
5409
5724
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
5410
5725
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5411
5726
|
|
5727
|
+
|
5412
5728
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5413
5729
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5414
5730
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
@@ -5849,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
|
|
5849
6165
|
return cur;
|
5850
6166
|
}
|
5851
6167
|
|
6168
|
+
static struct ggml_tensor * llm_build_moe_ffn(
|
6169
|
+
struct ggml_context * ctx,
|
6170
|
+
struct ggml_tensor * cur,
|
6171
|
+
struct ggml_tensor * gate_inp,
|
6172
|
+
struct ggml_tensor * up_exps,
|
6173
|
+
struct ggml_tensor * gate_exps,
|
6174
|
+
struct ggml_tensor * down_exps,
|
6175
|
+
int64_t n_expert,
|
6176
|
+
int64_t n_expert_used,
|
6177
|
+
llm_ffn_op_type type_op,
|
6178
|
+
bool norm_w,
|
6179
|
+
const llm_build_cb & cb,
|
6180
|
+
int il) {
|
6181
|
+
int64_t n_embd = cur->ne[0];
|
6182
|
+
int64_t n_tokens = cur->ne[1];
|
6183
|
+
|
6184
|
+
ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
|
6185
|
+
cb(logits, "ffn_moe_logits", il);
|
6186
|
+
|
6187
|
+
ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
6188
|
+
cb(probs, "ffn_moe_probs", il);
|
6189
|
+
|
6190
|
+
// select experts
|
6191
|
+
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
6192
|
+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
6193
|
+
cb(selected_experts, "ffn_moe_topk", il);
|
6194
|
+
|
6195
|
+
ggml_tensor * weights = ggml_get_rows(ctx,
|
6196
|
+
ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
|
6197
|
+
cb(weights, "ffn_moe_weights", il);
|
6198
|
+
|
6199
|
+
if (norm_w) {
|
6200
|
+
weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
|
6201
|
+
|
6202
|
+
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
6203
|
+
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6204
|
+
|
6205
|
+
weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
|
6206
|
+
cb(weights, "ffn_moe_weights_norm", il);
|
6207
|
+
|
6208
|
+
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6209
|
+
}
|
6210
|
+
|
6211
|
+
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6212
|
+
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6213
|
+
cb(up, "ffn_moe_up", il);
|
6214
|
+
|
6215
|
+
ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
6216
|
+
cb(gate, "ffn_moe_gate", il);
|
6217
|
+
|
6218
|
+
switch (type_op) {
|
6219
|
+
case LLM_FFN_SILU:
|
6220
|
+
{
|
6221
|
+
gate = ggml_silu(ctx, gate);
|
6222
|
+
cb(gate, "ffn_moe_silu", il);
|
6223
|
+
} break;
|
6224
|
+
case LLM_FFN_GELU:
|
6225
|
+
{
|
6226
|
+
gate = ggml_gelu(ctx, gate);
|
6227
|
+
cb(gate, "ffn_moe_gelu", il);
|
6228
|
+
} break;
|
6229
|
+
default:
|
6230
|
+
GGML_ASSERT(false);
|
6231
|
+
}
|
6232
|
+
|
6233
|
+
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
|
6234
|
+
cb(par, "ffn_moe_gate_par", il);
|
6235
|
+
|
6236
|
+
ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
6237
|
+
cb(experts, "ffn_moe_down", il);
|
6238
|
+
|
6239
|
+
experts = ggml_mul(ctx, experts, weights);
|
6240
|
+
|
6241
|
+
// aggregate experts
|
6242
|
+
ggml_tensor * moe_out = nullptr;
|
6243
|
+
for (int i = 0; i < n_expert_used; ++i) {
|
6244
|
+
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
|
6245
|
+
experts->nb[2], i*experts->nb[1]);
|
6246
|
+
|
6247
|
+
if (i == 0) {
|
6248
|
+
moe_out = cur_expert;
|
6249
|
+
} else {
|
6250
|
+
moe_out = ggml_add(ctx, moe_out, cur_expert);
|
6251
|
+
}
|
6252
|
+
}
|
6253
|
+
|
6254
|
+
if (n_expert_used == 1) {
|
6255
|
+
// avoid returning a non-contiguous tensor
|
6256
|
+
moe_out = ggml_cont(ctx, moe_out);
|
6257
|
+
}
|
6258
|
+
|
6259
|
+
return moe_out;
|
6260
|
+
}
|
6261
|
+
|
5852
6262
|
// if max_alibi_bias > 0 then apply ALiBi
|
5853
6263
|
static struct ggml_tensor * llm_build_kqv(
|
5854
6264
|
struct ggml_context * ctx,
|
@@ -6392,62 +6802,15 @@ struct llm_build_context {
|
|
6392
6802
|
LLM_NORM_RMS, cb, il);
|
6393
6803
|
cb(cur, "ffn_norm", il);
|
6394
6804
|
|
6395
|
-
|
6396
|
-
|
6397
|
-
|
6398
|
-
|
6399
|
-
|
6400
|
-
|
6401
|
-
|
6402
|
-
|
6403
|
-
cb(
|
6404
|
-
|
6405
|
-
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6406
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6407
|
-
cb(weights, "ffn_moe_weights", il);
|
6408
|
-
|
6409
|
-
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6410
|
-
|
6411
|
-
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6412
|
-
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6413
|
-
|
6414
|
-
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6415
|
-
cb(weights, "ffn_moe_weights_norm", il);
|
6416
|
-
|
6417
|
-
// compute expert outputs
|
6418
|
-
ggml_tensor * moe_out = nullptr;
|
6419
|
-
|
6420
|
-
for (int i = 0; i < n_expert_used; ++i) {
|
6421
|
-
ggml_tensor * cur_expert;
|
6422
|
-
|
6423
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6424
|
-
cb(cur_up, "ffn_moe_up", il);
|
6425
|
-
|
6426
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6427
|
-
cb(cur_gate, "ffn_moe_gate", il);
|
6428
|
-
|
6429
|
-
cur_gate = ggml_silu(ctx0, cur_gate);
|
6430
|
-
cb(cur_gate, "ffn_moe_silu", il);
|
6431
|
-
|
6432
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6433
|
-
cb(cur_expert, "ffn_moe_gate_par", il);
|
6434
|
-
|
6435
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6436
|
-
cb(cur_expert, "ffn_moe_down", il);
|
6437
|
-
|
6438
|
-
cur_expert = ggml_mul(ctx0, cur_expert,
|
6439
|
-
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6440
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
6441
|
-
|
6442
|
-
if (i == 0) {
|
6443
|
-
moe_out = cur_expert;
|
6444
|
-
} else {
|
6445
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6446
|
-
cb(moe_out, "ffn_moe_out", il);
|
6447
|
-
}
|
6448
|
-
}
|
6449
|
-
|
6450
|
-
cur = moe_out;
|
6805
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
6806
|
+
model.layers[il].ffn_gate_inp,
|
6807
|
+
model.layers[il].ffn_up_exps,
|
6808
|
+
model.layers[il].ffn_gate_exps,
|
6809
|
+
model.layers[il].ffn_down_exps,
|
6810
|
+
n_expert, n_expert_used,
|
6811
|
+
LLM_FFN_SILU, true,
|
6812
|
+
cb, il);
|
6813
|
+
cb(cur, "ffn_moe_out", il);
|
6451
6814
|
}
|
6452
6815
|
|
6453
6816
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
@@ -6926,63 +7289,15 @@ struct llm_build_context {
|
|
6926
7289
|
LLM_NORM_RMS, cb, il);
|
6927
7290
|
cb(cur, "ffn_norm", il);
|
6928
7291
|
|
6929
|
-
|
6930
|
-
|
6931
|
-
|
6932
|
-
|
6933
|
-
|
6934
|
-
|
6935
|
-
|
6936
|
-
|
6937
|
-
cb(
|
6938
|
-
|
6939
|
-
ggml_tensor * weights = ggml_get_rows(ctx0,
|
6940
|
-
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
6941
|
-
cb(weights, "ffn_moe_weights", il);
|
6942
|
-
|
6943
|
-
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
6944
|
-
|
6945
|
-
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
6946
|
-
cb(weights_sum, "ffn_moe_weights_sum", il);
|
6947
|
-
|
6948
|
-
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
6949
|
-
cb(weights, "ffn_moe_weights_norm", il);
|
6950
|
-
|
6951
|
-
// compute expert outputs
|
6952
|
-
ggml_tensor * moe_out = nullptr;
|
6953
|
-
|
6954
|
-
for (int i = 0; i < n_expert_used; ++i) {
|
6955
|
-
ggml_tensor * cur_expert;
|
6956
|
-
|
6957
|
-
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
6958
|
-
cb(cur_up, "ffn_moe_up", il);
|
6959
|
-
|
6960
|
-
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
6961
|
-
cb(cur_gate, "ffn_moe_gate", il);
|
6962
|
-
|
6963
|
-
//GeLU
|
6964
|
-
cur_gate = ggml_gelu(ctx0, cur_gate);
|
6965
|
-
cb(cur_gate, "ffn_moe_gelu", il);
|
6966
|
-
|
6967
|
-
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
6968
|
-
cb(cur_expert, "ffn_moe_gate_par", il);
|
6969
|
-
|
6970
|
-
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
6971
|
-
cb(cur_expert, "ffn_moe_down", il);
|
6972
|
-
|
6973
|
-
cur_expert = ggml_mul(ctx0, cur_expert,
|
6974
|
-
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
6975
|
-
cb(cur_expert, "ffn_moe_weighted", il);
|
6976
|
-
|
6977
|
-
if (i == 0) {
|
6978
|
-
moe_out = cur_expert;
|
6979
|
-
} else {
|
6980
|
-
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
6981
|
-
cb(moe_out, "ffn_moe_out", il);
|
6982
|
-
}
|
6983
|
-
}
|
6984
|
-
|
6985
|
-
cur = moe_out;
|
7292
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7293
|
+
model.layers[il].ffn_gate_inp,
|
7294
|
+
model.layers[il].ffn_up_exps,
|
7295
|
+
model.layers[il].ffn_gate_exps,
|
7296
|
+
model.layers[il].ffn_down_exps,
|
7297
|
+
n_expert, n_expert_used,
|
7298
|
+
LLM_FFN_GELU, true,
|
7299
|
+
cb, il);
|
7300
|
+
cb(cur, "ffn_moe_out", il);
|
6986
7301
|
|
6987
7302
|
// Grok
|
6988
7303
|
// if layer_out_norm is present then apply it before adding the input
|
@@ -6994,7 +7309,6 @@ struct llm_build_context {
|
|
6994
7309
|
cb(cur, "layer_out_norm", il);
|
6995
7310
|
}
|
6996
7311
|
|
6997
|
-
|
6998
7312
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
6999
7313
|
cb(cur, "ffn_out", il);
|
7000
7314
|
|
@@ -7030,12 +7344,16 @@ struct llm_build_context {
|
|
7030
7344
|
return gf;
|
7031
7345
|
}
|
7032
7346
|
|
7033
|
-
struct ggml_cgraph *
|
7347
|
+
struct ggml_cgraph * build_dbrx() {
|
7034
7348
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7035
7349
|
|
7350
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
7351
|
+
int32_t n_tokens = this->n_tokens;
|
7352
|
+
|
7036
7353
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7037
7354
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
7038
7355
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7356
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
7039
7357
|
|
7040
7358
|
struct ggml_tensor * cur;
|
7041
7359
|
struct ggml_tensor * inpL;
|
@@ -7048,16 +7366,140 @@ struct llm_build_context {
|
|
7048
7366
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7049
7367
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7050
7368
|
|
7051
|
-
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7052
|
-
cb(pos, "pos_embd", -1);
|
7053
|
-
|
7054
|
-
inpL = ggml_add(ctx0, inpL, pos);
|
7055
|
-
cb(inpL, "inpL", -1);
|
7056
|
-
|
7057
7369
|
for (int il = 0; il < n_layer; ++il) {
|
7370
|
+
struct ggml_tensor * inpSA = inpL;
|
7371
|
+
|
7372
|
+
// norm
|
7058
7373
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
7059
|
-
|
7060
|
-
|
7374
|
+
model.layers[il].attn_norm, NULL,
|
7375
|
+
LLM_NORM, cb, il);
|
7376
|
+
cb(cur, "attn_norm", il);
|
7377
|
+
|
7378
|
+
// self-attention
|
7379
|
+
{
|
7380
|
+
struct ggml_tensor * Qcur = nullptr;
|
7381
|
+
struct ggml_tensor * Kcur = nullptr;
|
7382
|
+
struct ggml_tensor * Vcur = nullptr;
|
7383
|
+
|
7384
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
7385
|
+
cb(cur, "wqkv", il);
|
7386
|
+
|
7387
|
+
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
7388
|
+
cb(cur, "wqkv_clamped", il);
|
7389
|
+
|
7390
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
7391
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
7392
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
7393
|
+
|
7394
|
+
cb(Qcur, "Qcur", il);
|
7395
|
+
cb(Kcur, "Kcur", il);
|
7396
|
+
cb(Vcur, "Vcur", il);
|
7397
|
+
|
7398
|
+
Qcur = ggml_rope_custom(
|
7399
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7400
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7401
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7402
|
+
);
|
7403
|
+
cb(Qcur, "Qcur", il);
|
7404
|
+
|
7405
|
+
Kcur = ggml_rope_custom(
|
7406
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7407
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7408
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
7409
|
+
);
|
7410
|
+
cb(Kcur, "Kcur", il);
|
7411
|
+
|
7412
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
7413
|
+
model.layers[il].wo, NULL,
|
7414
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
7415
|
+
}
|
7416
|
+
|
7417
|
+
if (il == n_layer - 1) {
|
7418
|
+
// skip computing output for unused tokens
|
7419
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7420
|
+
n_tokens = n_outputs;
|
7421
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
7422
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7423
|
+
}
|
7424
|
+
|
7425
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
7426
|
+
cb(ffn_inp, "ffn_inp", il);
|
7427
|
+
|
7428
|
+
// feed-forward network
|
7429
|
+
// MoE branch
|
7430
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
7431
|
+
model.layers[il].attn_out_norm, NULL,
|
7432
|
+
LLM_NORM, cb, il);
|
7433
|
+
cb(cur, "attn_out_norm", il);
|
7434
|
+
|
7435
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
7436
|
+
model.layers[il].ffn_gate_inp,
|
7437
|
+
model.layers[il].ffn_up_exps,
|
7438
|
+
model.layers[il].ffn_gate_exps,
|
7439
|
+
model.layers[il].ffn_down_exps,
|
7440
|
+
n_expert, n_expert_used,
|
7441
|
+
LLM_FFN_SILU, true,
|
7442
|
+
cb, il);
|
7443
|
+
cb(cur, "ffn_moe_out", il);
|
7444
|
+
|
7445
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
7446
|
+
cb(cur, "ffn_out", il);
|
7447
|
+
|
7448
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
7449
|
+
if (layer_dir != nullptr) {
|
7450
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
7451
|
+
}
|
7452
|
+
cb(cur, "l_out", il);
|
7453
|
+
|
7454
|
+
// input for next layer
|
7455
|
+
inpL = cur;
|
7456
|
+
}
|
7457
|
+
|
7458
|
+
cur = inpL;
|
7459
|
+
|
7460
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
7461
|
+
model.output_norm, NULL,
|
7462
|
+
LLM_NORM, cb, -1);
|
7463
|
+
cb(cur, "result_norm", -1);
|
7464
|
+
|
7465
|
+
// lm_head
|
7466
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
7467
|
+
|
7468
|
+
cb(cur, "result_output", -1);
|
7469
|
+
|
7470
|
+
ggml_build_forward_expand(gf, cur);
|
7471
|
+
|
7472
|
+
return gf;
|
7473
|
+
}
|
7474
|
+
|
7475
|
+
struct ggml_cgraph * build_starcoder() {
|
7476
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
7477
|
+
|
7478
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
7479
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
7480
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
7481
|
+
|
7482
|
+
struct ggml_tensor * cur;
|
7483
|
+
struct ggml_tensor * inpL;
|
7484
|
+
|
7485
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
7486
|
+
|
7487
|
+
// inp_pos - contains the positions
|
7488
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
7489
|
+
|
7490
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
7491
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7492
|
+
|
7493
|
+
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
7494
|
+
cb(pos, "pos_embd", -1);
|
7495
|
+
|
7496
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
7497
|
+
cb(inpL, "inpL", -1);
|
7498
|
+
|
7499
|
+
for (int il = 0; il < n_layer; ++il) {
|
7500
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
7501
|
+
model.layers[il].attn_norm,
|
7502
|
+
model.layers[il].attn_norm_b,
|
7061
7503
|
LLM_NORM, cb, il);
|
7062
7504
|
cb(cur, "attn_norm", il);
|
7063
7505
|
|
@@ -7882,7 +8324,7 @@ struct llm_build_context {
|
|
7882
8324
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
7883
8325
|
|
7884
8326
|
for (int il = 0; il < n_layer; ++il) {
|
7885
|
-
|
8327
|
+
|
7886
8328
|
|
7887
8329
|
// norm
|
7888
8330
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
@@ -7891,6 +8333,8 @@ struct llm_build_context {
|
|
7891
8333
|
LLM_NORM, cb, il);
|
7892
8334
|
cb(cur, "attn_norm", il);
|
7893
8335
|
|
8336
|
+
struct ggml_tensor * inpSA = cur;
|
8337
|
+
|
7894
8338
|
// self-attention
|
7895
8339
|
{
|
7896
8340
|
// compute Q and K and RoPE them
|
@@ -7915,15 +8359,36 @@ struct llm_build_context {
|
|
7915
8359
|
cb(Vcur, "Vcur", il);
|
7916
8360
|
}
|
7917
8361
|
|
8362
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
8363
|
+
cb(Qcur, "Qcur", il);
|
8364
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
8365
|
+
cb(Kcur, "Kcur", il);
|
8366
|
+
|
8367
|
+
if (model.layers[il].attn_q_norm) {
|
8368
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
8369
|
+
model.layers[il].attn_q_norm,
|
8370
|
+
NULL,
|
8371
|
+
LLM_NORM, cb, il);
|
8372
|
+
cb(Qcur, "Qcur", il);
|
8373
|
+
}
|
8374
|
+
if (model.layers[il].attn_k_norm) {
|
8375
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
8376
|
+
model.layers[il].attn_k_norm,
|
8377
|
+
NULL,
|
8378
|
+
LLM_NORM, cb, il);
|
8379
|
+
cb(Kcur, "Kcur", il);
|
8380
|
+
}
|
8381
|
+
|
8382
|
+
|
7918
8383
|
Qcur = ggml_rope_custom(
|
7919
|
-
ctx0,
|
8384
|
+
ctx0, Qcur, inp_pos,
|
7920
8385
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7921
8386
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7922
8387
|
);
|
7923
8388
|
cb(Qcur, "Qcur", il);
|
7924
8389
|
|
7925
8390
|
Kcur = ggml_rope_custom(
|
7926
|
-
ctx0,
|
8391
|
+
ctx0, Kcur, inp_pos,
|
7927
8392
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7928
8393
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7929
8394
|
);
|
@@ -7938,20 +8403,25 @@ struct llm_build_context {
|
|
7938
8403
|
// skip computing output for unused tokens
|
7939
8404
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
7940
8405
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8406
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
7941
8407
|
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
7942
8408
|
}
|
7943
8409
|
|
7944
|
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur,
|
8410
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
7945
8411
|
cb(ffn_inp, "ffn_inp", il);
|
7946
8412
|
|
7947
8413
|
// feed-forward network
|
7948
8414
|
{
|
7949
|
-
|
7950
|
-
|
7951
|
-
|
7952
|
-
|
7953
|
-
|
7954
|
-
|
8415
|
+
if (model.layers[il].ffn_norm) {
|
8416
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8417
|
+
model.layers[il].ffn_norm,
|
8418
|
+
model.layers[il].ffn_norm_b,
|
8419
|
+
LLM_NORM, cb, il);
|
8420
|
+
cb(cur, "ffn_norm", il);
|
8421
|
+
} else {
|
8422
|
+
// parallel residual
|
8423
|
+
cur = inpSA;
|
8424
|
+
}
|
7955
8425
|
cur = llm_build_ffn(ctx0, cur,
|
7956
8426
|
model.layers[il].ffn_up, NULL,
|
7957
8427
|
model.layers[il].ffn_gate, NULL,
|
@@ -8141,12 +8611,6 @@ struct llm_build_context {
|
|
8141
8611
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8142
8612
|
cb(Vcur, "Vcur", il);
|
8143
8613
|
|
8144
|
-
// these nodes are added to the graph together so that they are not reordered
|
8145
|
-
// by doing so, the number of splits in the graph is reduced
|
8146
|
-
ggml_build_forward_expand(gf, Qcur);
|
8147
|
-
ggml_build_forward_expand(gf, Kcur);
|
8148
|
-
ggml_build_forward_expand(gf, Vcur);
|
8149
|
-
|
8150
8614
|
Qcur = ggml_rope_custom(
|
8151
8615
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8152
8616
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
@@ -8213,6 +8677,150 @@ struct llm_build_context {
|
|
8213
8677
|
return gf;
|
8214
8678
|
}
|
8215
8679
|
|
8680
|
+
struct ggml_cgraph * build_qwen2moe() {
|
8681
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8682
|
+
|
8683
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
8684
|
+
int32_t n_tokens = this->n_tokens;
|
8685
|
+
|
8686
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
8687
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
8688
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
8689
|
+
|
8690
|
+
struct ggml_tensor * cur;
|
8691
|
+
struct ggml_tensor * inpL;
|
8692
|
+
|
8693
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
8694
|
+
|
8695
|
+
// inp_pos - contains the positions
|
8696
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
8697
|
+
|
8698
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
8699
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
8700
|
+
|
8701
|
+
for (int il = 0; il < n_layer; ++il) {
|
8702
|
+
struct ggml_tensor * inpSA = inpL;
|
8703
|
+
|
8704
|
+
// norm
|
8705
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
8706
|
+
model.layers[il].attn_norm, NULL,
|
8707
|
+
LLM_NORM_RMS, cb, il);
|
8708
|
+
cb(cur, "attn_norm", il);
|
8709
|
+
|
8710
|
+
// self_attention
|
8711
|
+
{
|
8712
|
+
// compute Q and K and RoPE them
|
8713
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
8714
|
+
cb(Qcur, "Qcur", il);
|
8715
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
8716
|
+
cb(Qcur, "Qcur", il);
|
8717
|
+
|
8718
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
8719
|
+
cb(Kcur, "Kcur", il);
|
8720
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
8721
|
+
cb(Kcur, "Kcur", il);
|
8722
|
+
|
8723
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
8724
|
+
cb(Vcur, "Vcur", il);
|
8725
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
8726
|
+
cb(Vcur, "Vcur", il);
|
8727
|
+
|
8728
|
+
Qcur = ggml_rope_custom(
|
8729
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
8730
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8731
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8732
|
+
);
|
8733
|
+
cb(Qcur, "Qcur", il);
|
8734
|
+
|
8735
|
+
Kcur = ggml_rope_custom(
|
8736
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
8737
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
8738
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
8739
|
+
);
|
8740
|
+
cb(Kcur, "Kcur", il);
|
8741
|
+
|
8742
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
8743
|
+
model.layers[il].wo, model.layers[il].bo,
|
8744
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
8745
|
+
}
|
8746
|
+
|
8747
|
+
if (il == n_layer - 1) {
|
8748
|
+
// skip computing output for unused tokens
|
8749
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
8750
|
+
n_tokens = n_outputs;
|
8751
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
8752
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
8753
|
+
}
|
8754
|
+
|
8755
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
8756
|
+
cb(ffn_inp, "ffn_inp", il);
|
8757
|
+
|
8758
|
+
// MoE branch
|
8759
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
8760
|
+
model.layers[il].ffn_norm, NULL,
|
8761
|
+
LLM_NORM_RMS, cb, il);
|
8762
|
+
cb(cur, "ffn_norm", il);
|
8763
|
+
|
8764
|
+
ggml_tensor * moe_out =
|
8765
|
+
llm_build_moe_ffn(ctx0, cur,
|
8766
|
+
model.layers[il].ffn_gate_inp,
|
8767
|
+
model.layers[il].ffn_up_exps,
|
8768
|
+
model.layers[il].ffn_gate_exps,
|
8769
|
+
model.layers[il].ffn_down_exps,
|
8770
|
+
n_expert, n_expert_used,
|
8771
|
+
LLM_FFN_SILU, false,
|
8772
|
+
cb, il);
|
8773
|
+
cb(cur, "ffn_moe_out", il);
|
8774
|
+
|
8775
|
+
// FFN shared expert
|
8776
|
+
{
|
8777
|
+
ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
8778
|
+
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
8779
|
+
|
8780
|
+
// sigmoid
|
8781
|
+
ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
8782
|
+
cb(cur_gate, "ffn_shexp_gate", il);
|
8783
|
+
|
8784
|
+
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
8785
|
+
model.layers[il].ffn_up_shexp, NULL,
|
8786
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
8787
|
+
model.layers[il].ffn_down_shexp, NULL,
|
8788
|
+
NULL,
|
8789
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
8790
|
+
cb(cur_ffn, "ffn_shexp", il);
|
8791
|
+
|
8792
|
+
ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
|
8793
|
+
cb(ffn_shexp_out, "ffn_shexp_out", il);
|
8794
|
+
|
8795
|
+
moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
|
8796
|
+
cb(moe_out, "ffn_out", il);
|
8797
|
+
|
8798
|
+
cur = moe_out;
|
8799
|
+
}
|
8800
|
+
|
8801
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
8802
|
+
cb(cur, "l_out", il);
|
8803
|
+
|
8804
|
+
// input for next layer
|
8805
|
+
inpL = cur;
|
8806
|
+
}
|
8807
|
+
|
8808
|
+
cur = inpL;
|
8809
|
+
|
8810
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
8811
|
+
model.output_norm, NULL,
|
8812
|
+
LLM_NORM_RMS, cb, -1);
|
8813
|
+
cb(cur, "result_norm", -1);
|
8814
|
+
|
8815
|
+
// lm_head
|
8816
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8817
|
+
cb(cur, "result_output", -1);
|
8818
|
+
|
8819
|
+
ggml_build_forward_expand(gf, cur);
|
8820
|
+
|
8821
|
+
return gf;
|
8822
|
+
}
|
8823
|
+
|
8216
8824
|
struct ggml_cgraph * build_phi2() {
|
8217
8825
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8218
8826
|
|
@@ -9452,6 +10060,31 @@ struct llm_build_context {
|
|
9452
10060
|
cb(Vcur, "Vcur", il);
|
9453
10061
|
}
|
9454
10062
|
|
10063
|
+
if (model.layers[il].attn_q_norm) {
|
10064
|
+
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
10065
|
+
ggml_element_size(Qcur) * n_embd_head,
|
10066
|
+
ggml_element_size(Qcur) * n_embd_head * n_head,
|
10067
|
+
0);
|
10068
|
+
cb(Qcur, "Qcur", il);
|
10069
|
+
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
10070
|
+
ggml_element_size(Kcur) * n_embd_head,
|
10071
|
+
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
10072
|
+
0);
|
10073
|
+
cb(Kcur, "Kcur", il);
|
10074
|
+
|
10075
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
10076
|
+
model.layers[il].attn_q_norm,
|
10077
|
+
NULL,
|
10078
|
+
LLM_NORM, cb, il);
|
10079
|
+
cb(Qcur, "Qcur", il);
|
10080
|
+
|
10081
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
10082
|
+
model.layers[il].attn_k_norm,
|
10083
|
+
NULL,
|
10084
|
+
LLM_NORM, cb, il);
|
10085
|
+
cb(Kcur, "Kcur", il);
|
10086
|
+
}
|
10087
|
+
|
9455
10088
|
Qcur = ggml_rope_custom(
|
9456
10089
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9457
10090
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
@@ -9522,6 +10155,139 @@ struct llm_build_context {
|
|
9522
10155
|
return gf;
|
9523
10156
|
|
9524
10157
|
}
|
10158
|
+
|
10159
|
+
// ref: https://allenai.org/olmo
|
10160
|
+
// based on the original build_llama() function, changes:
|
10161
|
+
// * non-parametric layer norm
|
10162
|
+
// * clamp qkv
|
10163
|
+
// * removed bias
|
10164
|
+
// * removed MoE
|
10165
|
+
struct ggml_cgraph * build_olmo() {
|
10166
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10167
|
+
|
10168
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
10169
|
+
int32_t n_tokens = this->n_tokens;
|
10170
|
+
|
10171
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10172
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10173
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
10174
|
+
|
10175
|
+
struct ggml_tensor * cur;
|
10176
|
+
struct ggml_tensor * inpL;
|
10177
|
+
|
10178
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10179
|
+
|
10180
|
+
// inp_pos - contains the positions
|
10181
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10182
|
+
|
10183
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10184
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10185
|
+
|
10186
|
+
for (int il = 0; il < n_layer; ++il) {
|
10187
|
+
struct ggml_tensor * inpSA = inpL;
|
10188
|
+
|
10189
|
+
// norm
|
10190
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10191
|
+
NULL, NULL,
|
10192
|
+
LLM_NORM, cb, il);
|
10193
|
+
cb(cur, "attn_norm", il);
|
10194
|
+
|
10195
|
+
// self-attention
|
10196
|
+
{
|
10197
|
+
// compute Q and K and RoPE them
|
10198
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10199
|
+
cb(Qcur, "Qcur", il);
|
10200
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10201
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10202
|
+
cb(Qcur, "Qcur", il);
|
10203
|
+
}
|
10204
|
+
|
10205
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10206
|
+
cb(Kcur, "Kcur", il);
|
10207
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10208
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10209
|
+
cb(Kcur, "Kcur", il);
|
10210
|
+
}
|
10211
|
+
|
10212
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10213
|
+
cb(Vcur, "Vcur", il);
|
10214
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10215
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10216
|
+
cb(Vcur, "Vcur", il);
|
10217
|
+
}
|
10218
|
+
|
10219
|
+
Qcur = ggml_rope_custom(
|
10220
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10221
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10222
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10223
|
+
);
|
10224
|
+
cb(Qcur, "Qcur", il);
|
10225
|
+
|
10226
|
+
Kcur = ggml_rope_custom(
|
10227
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10228
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10229
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10230
|
+
);
|
10231
|
+
cb(Kcur, "Kcur", il);
|
10232
|
+
|
10233
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10234
|
+
model.layers[il].wo, nullptr,
|
10235
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10236
|
+
}
|
10237
|
+
|
10238
|
+
if (il == n_layer - 1) {
|
10239
|
+
// skip computing output for unused tokens
|
10240
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10241
|
+
n_tokens = n_outputs;
|
10242
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10243
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10244
|
+
}
|
10245
|
+
|
10246
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10247
|
+
cb(ffn_inp, "ffn_inp", il);
|
10248
|
+
|
10249
|
+
// feed-forward network
|
10250
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10251
|
+
NULL, NULL,
|
10252
|
+
LLM_NORM, cb, il);
|
10253
|
+
cb(cur, "ffn_norm", il);
|
10254
|
+
|
10255
|
+
cur = llm_build_ffn(ctx0, cur,
|
10256
|
+
model.layers[il].ffn_up, NULL,
|
10257
|
+
model.layers[il].ffn_gate, NULL,
|
10258
|
+
model.layers[il].ffn_down, NULL,
|
10259
|
+
NULL,
|
10260
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10261
|
+
cb(cur, "ffn_out", il);
|
10262
|
+
|
10263
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10264
|
+
cb(cur, "ffn_out", il);
|
10265
|
+
|
10266
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10267
|
+
if (layer_dir != nullptr) {
|
10268
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10269
|
+
}
|
10270
|
+
cb(cur, "l_out", il);
|
10271
|
+
|
10272
|
+
// input for next layer
|
10273
|
+
inpL = cur;
|
10274
|
+
}
|
10275
|
+
|
10276
|
+
cur = inpL;
|
10277
|
+
|
10278
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10279
|
+
NULL, NULL,
|
10280
|
+
LLM_NORM, cb, -1);
|
10281
|
+
cb(cur, "result_norm", -1);
|
10282
|
+
|
10283
|
+
// lm_head
|
10284
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10285
|
+
cb(cur, "result_output", -1);
|
10286
|
+
|
10287
|
+
ggml_build_forward_expand(gf, cur);
|
10288
|
+
|
10289
|
+
return gf;
|
10290
|
+
}
|
9525
10291
|
};
|
9526
10292
|
|
9527
10293
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -9671,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9671
10437
|
{
|
9672
10438
|
result = llm.build_qwen2();
|
9673
10439
|
} break;
|
10440
|
+
case LLM_ARCH_QWEN2MOE:
|
10441
|
+
{
|
10442
|
+
result = llm.build_qwen2moe();
|
10443
|
+
} break;
|
9674
10444
|
case LLM_ARCH_PHI2:
|
9675
10445
|
{
|
9676
10446
|
result = llm.build_phi2();
|
@@ -9715,9 +10485,17 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9715
10485
|
{
|
9716
10486
|
result = llm.build_xverse();
|
9717
10487
|
} break;
|
9718
|
-
case LLM_ARCH_COMMAND_R:
|
10488
|
+
case LLM_ARCH_COMMAND_R:
|
10489
|
+
{
|
10490
|
+
result = llm.build_command_r();
|
10491
|
+
} break;
|
10492
|
+
case LLM_ARCH_DBRX:
|
10493
|
+
{
|
10494
|
+
result = llm.build_dbrx();
|
10495
|
+
} break;
|
10496
|
+
case LLM_ARCH_OLMO:
|
9719
10497
|
{
|
9720
|
-
result = llm.
|
10498
|
+
result = llm.build_olmo();
|
9721
10499
|
} break;
|
9722
10500
|
default:
|
9723
10501
|
GGML_ASSERT(false);
|
@@ -10409,6 +11187,9 @@ static int llama_decode_internal(
|
|
10409
11187
|
n_outputs_prev += lctx.n_outputs;
|
10410
11188
|
}
|
10411
11189
|
|
11190
|
+
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
11191
|
+
lctx.n_outputs = n_outputs;
|
11192
|
+
|
10412
11193
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
10413
11194
|
//llama_synchronize(&lctx);
|
10414
11195
|
|
@@ -11052,7 +11833,7 @@ struct llm_tokenizer_bpe {
|
|
11052
11833
|
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
|
11053
11834
|
}
|
11054
11835
|
|
11055
|
-
// add the
|
11836
|
+
// add the finished tokens to the final list keeping correct order for next and prev
|
11056
11837
|
for (auto & sym : symbols) {
|
11057
11838
|
if (sym.n > 0) {
|
11058
11839
|
sym.prev = final_prev_index;
|
@@ -11321,9 +12102,6 @@ struct llm_tokenizer_wpm {
|
|
11321
12102
|
output.push_back(vocab.special_unk_id);
|
11322
12103
|
}
|
11323
12104
|
}
|
11324
|
-
|
11325
|
-
// append eos token
|
11326
|
-
output.push_back(vocab.special_eos_id);
|
11327
12105
|
}
|
11328
12106
|
|
11329
12107
|
std::vector<std::string> preprocess(const std::string & text) {
|
@@ -11528,30 +12306,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
11528
12306
|
}
|
11529
12307
|
}
|
11530
12308
|
|
11531
|
-
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool
|
12309
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
|
11532
12310
|
std::vector<llama_vocab::id> output;
|
11533
|
-
|
11534
|
-
// OG tokenizer behavior:
|
11535
|
-
//
|
11536
|
-
// tokenizer.encode('', add_bos=True) returns [1]
|
11537
|
-
// tokenizer.encode('', add_bos=False) returns []
|
11538
|
-
|
11539
|
-
if (bos && vocab.special_bos_id != -1) {
|
11540
|
-
output.push_back(vocab.special_bos_id);
|
11541
|
-
}
|
11542
|
-
|
11543
|
-
if (raw_text.empty()) {
|
11544
|
-
return output;
|
11545
|
-
}
|
11546
|
-
|
11547
12311
|
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
11548
|
-
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
11549
12312
|
|
11550
|
-
if (
|
12313
|
+
if (!raw_text.empty()) {
|
12314
|
+
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
12315
|
+
if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
|
12316
|
+
}
|
11551
12317
|
|
11552
12318
|
switch (vocab.type) {
|
11553
12319
|
case LLAMA_VOCAB_TYPE_SPM:
|
11554
12320
|
{
|
12321
|
+
// OG tokenizer behavior:
|
12322
|
+
//
|
12323
|
+
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
12324
|
+
// tokenizer.encode('', add_special_tokens=False) returns []
|
12325
|
+
|
12326
|
+
if (add_special && vocab.special_add_bos != 0) {
|
12327
|
+
GGML_ASSERT(vocab.special_bos_id != -1);
|
12328
|
+
output.push_back(vocab.special_bos_id);
|
12329
|
+
}
|
12330
|
+
|
11555
12331
|
for (const auto & fragment : fragment_buffer) {
|
11556
12332
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11557
12333
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
@@ -11577,9 +12353,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11577
12353
|
output.push_back(fragment.token);
|
11578
12354
|
}
|
11579
12355
|
}
|
12356
|
+
|
12357
|
+
if (add_special && vocab.special_add_eos == 1) {
|
12358
|
+
GGML_ASSERT(vocab.special_eos_id != -1);
|
12359
|
+
output.push_back(vocab.special_eos_id);
|
12360
|
+
}
|
11580
12361
|
} break;
|
11581
12362
|
case LLAMA_VOCAB_TYPE_BPE:
|
11582
12363
|
{
|
12364
|
+
if (add_special && vocab.special_add_bos == 1) {
|
12365
|
+
GGML_ASSERT(vocab.special_bos_id != -1);
|
12366
|
+
output.push_back(vocab.special_bos_id);
|
12367
|
+
}
|
12368
|
+
|
11583
12369
|
for (const auto & fragment : fragment_buffer) {
|
11584
12370
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11585
12371
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
@@ -11593,9 +12379,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11593
12379
|
output.push_back(fragment.token);
|
11594
12380
|
}
|
11595
12381
|
}
|
12382
|
+
|
12383
|
+
GGML_ASSERT(vocab.special_add_eos != 1);
|
11596
12384
|
} break;
|
11597
12385
|
case LLAMA_VOCAB_TYPE_WPM:
|
11598
12386
|
{
|
12387
|
+
if (add_special) {
|
12388
|
+
GGML_ASSERT(vocab.special_cls_id != -1);
|
12389
|
+
output.push_back(vocab.special_cls_id);
|
12390
|
+
}
|
12391
|
+
|
11599
12392
|
for (const auto & fragment : fragment_buffer) {
|
11600
12393
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11601
12394
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
@@ -11609,6 +12402,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11609
12402
|
output.push_back(fragment.token);
|
11610
12403
|
}
|
11611
12404
|
}
|
12405
|
+
|
12406
|
+
if (add_special) {
|
12407
|
+
GGML_ASSERT(vocab.special_sep_id != -1);
|
12408
|
+
output.push_back(vocab.special_sep_id);
|
12409
|
+
}
|
11612
12410
|
} break;
|
11613
12411
|
case LLAMA_VOCAB_TYPE_NONE:
|
11614
12412
|
GGML_ASSERT(false);
|
@@ -11775,7 +12573,9 @@ static void llama_grammar_advance_stack(
|
|
11775
12573
|
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
11776
12574
|
|
11777
12575
|
if (stack.empty()) {
|
11778
|
-
new_stacks.
|
12576
|
+
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
12577
|
+
new_stacks.emplace_back(stack);
|
12578
|
+
}
|
11779
12579
|
return;
|
11780
12580
|
}
|
11781
12581
|
|
@@ -11812,7 +12612,10 @@ static void llama_grammar_advance_stack(
|
|
11812
12612
|
}
|
11813
12613
|
case LLAMA_GRETYPE_CHAR:
|
11814
12614
|
case LLAMA_GRETYPE_CHAR_NOT:
|
11815
|
-
new_stacks.
|
12615
|
+
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
12616
|
+
// only add the stack if it's not a duplicate of one we already have
|
12617
|
+
new_stacks.emplace_back(stack);
|
12618
|
+
}
|
11816
12619
|
break;
|
11817
12620
|
default:
|
11818
12621
|
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
@@ -11826,12 +12629,13 @@ static void llama_grammar_advance_stack(
|
|
11826
12629
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
11827
12630
|
// produces the N possible stacks if the given char is accepted at those
|
11828
12631
|
// positions
|
11829
|
-
|
12632
|
+
void llama_grammar_accept(
|
11830
12633
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
11831
12634
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
11832
|
-
const uint32_t chr
|
12635
|
+
const uint32_t chr,
|
12636
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
11833
12637
|
|
11834
|
-
|
12638
|
+
new_stacks.clear();
|
11835
12639
|
|
11836
12640
|
for (const auto & stack : stacks) {
|
11837
12641
|
if (stack.empty()) {
|
@@ -11850,8 +12654,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
|
11850
12654
|
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
11851
12655
|
}
|
11852
12656
|
}
|
11853
|
-
|
11854
|
-
return new_stacks;
|
11855
12657
|
}
|
11856
12658
|
|
11857
12659
|
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
@@ -11865,6 +12667,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
11865
12667
|
const std::vector<llama_grammar_candidate> & candidates) {
|
11866
12668
|
|
11867
12669
|
std::vector<llama_grammar_candidate> rejects;
|
12670
|
+
rejects.reserve(candidates.size());
|
11868
12671
|
|
11869
12672
|
if (stack.empty()) {
|
11870
12673
|
for (const auto & tok : candidates) {
|
@@ -11878,6 +12681,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
11878
12681
|
const llama_grammar_element * stack_pos = stack.back();
|
11879
12682
|
|
11880
12683
|
std::vector<llama_grammar_candidate> next_candidates;
|
12684
|
+
next_candidates.reserve(candidates.size());
|
12685
|
+
|
11881
12686
|
for (const auto & tok : candidates) {
|
11882
12687
|
if (*tok.code_points == 0) {
|
11883
12688
|
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
@@ -12685,8 +13490,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
12685
13490
|
// Note terminating 0 in decoded string
|
12686
13491
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
12687
13492
|
const auto & code_points = decoded.first;
|
13493
|
+
std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
|
12688
13494
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
12689
|
-
|
13495
|
+
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
|
13496
|
+
grammar->stacks = tmp_new_stacks;
|
12690
13497
|
}
|
12691
13498
|
grammar->partial_utf8 = decoded.second;
|
12692
13499
|
GGML_ASSERT(!grammar->stacks.empty());
|
@@ -12820,6 +13627,11 @@ struct llama_beam_search_data {
|
|
12820
13627
|
}
|
12821
13628
|
llama_logit_info logit_info(ctx);
|
12822
13629
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
13630
|
+
|
13631
|
+
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
13632
|
+
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
13633
|
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
13634
|
+
|
12823
13635
|
size_t i=0;
|
12824
13636
|
if (next_beams.size() < n_beams) {
|
12825
13637
|
for (; next_beams.size() < n_beams ; ++i) {
|
@@ -13318,9 +14130,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
13318
14130
|
return new_type;
|
13319
14131
|
}
|
13320
14132
|
|
13321
|
-
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const
|
14133
|
+
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
13322
14134
|
std::mutex mutex;
|
13323
|
-
|
14135
|
+
int64_t counter = 0;
|
13324
14136
|
size_t new_size = 0;
|
13325
14137
|
if (nthread < 2) {
|
13326
14138
|
// single-thread
|
@@ -13328,11 +14140,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
13328
14140
|
}
|
13329
14141
|
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
|
13330
14142
|
nrows, n_per_row, imatrix]() {
|
13331
|
-
const
|
14143
|
+
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
13332
14144
|
size_t local_size = 0;
|
13333
14145
|
while (true) {
|
13334
14146
|
std::unique_lock<std::mutex> lock(mutex);
|
13335
|
-
|
14147
|
+
int64_t first_row = counter; counter += nrows_per_chunk;
|
13336
14148
|
if (first_row >= nrows) {
|
13337
14149
|
if (local_size > 0) {
|
13338
14150
|
new_size += local_size;
|
@@ -13340,7 +14152,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
13340
14152
|
break;
|
13341
14153
|
}
|
13342
14154
|
lock.unlock();
|
13343
|
-
const
|
14155
|
+
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
13344
14156
|
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
13345
14157
|
}
|
13346
14158
|
};
|
@@ -13440,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13440
14252
|
gguf_set_kv (ctx_out, ml.meta);
|
13441
14253
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
13442
14254
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
14255
|
+
// Remove split metadata
|
14256
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
14257
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
|
14258
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
13443
14259
|
|
13444
14260
|
if (params->kv_overrides) {
|
13445
14261
|
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
@@ -13463,7 +14279,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13463
14279
|
const std::string name = ggml_get_name(meta);
|
13464
14280
|
|
13465
14281
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
13466
|
-
if (name.find("attn_v.weight")
|
14282
|
+
if (name.find("attn_v.weight") != std::string::npos ||
|
14283
|
+
name.find("attn_qkv.weight") != std::string::npos) {
|
13467
14284
|
++qs.n_attention_wv;
|
13468
14285
|
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
13469
14286
|
qs.has_output = true;
|
@@ -13473,7 +14290,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13473
14290
|
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
13474
14291
|
|
13475
14292
|
// sanity checks
|
13476
|
-
|
14293
|
+
//
|
14294
|
+
// - qs.n_attention_wv == 0 for Mamba models
|
14295
|
+
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
14296
|
+
//
|
14297
|
+
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
13477
14298
|
|
13478
14299
|
size_t total_size_org = 0;
|
13479
14300
|
size_t total_size_new = 0;
|
@@ -13529,6 +14350,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13529
14350
|
|
13530
14351
|
// quantize only 2D and 3D tensors (experts)
|
13531
14352
|
quantize &= (ggml_n_dims(tensor) >= 2);
|
14353
|
+
|
14354
|
+
// do not quantize norm tensors
|
14355
|
+
quantize &= name.find("_norm.weight") == std::string::npos;
|
14356
|
+
|
13532
14357
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
13533
14358
|
quantize &= !params->only_copy;
|
13534
14359
|
|
@@ -13557,10 +14382,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13557
14382
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
13558
14383
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
|
13559
14384
|
}
|
13560
|
-
|
14385
|
+
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
13561
14386
|
new_type = params->token_embedding_type;
|
13562
14387
|
}
|
13563
|
-
|
14388
|
+
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
13564
14389
|
new_type = params->output_tensor_type;
|
13565
14390
|
}
|
13566
14391
|
|
@@ -13575,7 +14400,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13575
14400
|
new_size = ggml_nbytes(tensor);
|
13576
14401
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
13577
14402
|
} else {
|
13578
|
-
const
|
14403
|
+
const int64_t nelements = ggml_nelements(tensor);
|
13579
14404
|
|
13580
14405
|
const float * imatrix = nullptr;
|
13581
14406
|
if (imatrix_data) {
|
@@ -13627,20 +14452,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13627
14452
|
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
13628
14453
|
fflush(stdout);
|
13629
14454
|
|
13630
|
-
if (work.size() < nelements * 4) {
|
14455
|
+
if (work.size() < (size_t)nelements * 4) {
|
13631
14456
|
work.resize(nelements * 4); // upper bound on size
|
13632
14457
|
}
|
13633
14458
|
new_data = work.data();
|
13634
14459
|
|
13635
|
-
const
|
13636
|
-
const
|
14460
|
+
const int64_t n_per_row = tensor->ne[0];
|
14461
|
+
const int64_t nrows = tensor->ne[1];
|
13637
14462
|
|
13638
|
-
static const
|
13639
|
-
const
|
14463
|
+
static const int64_t min_chunk_size = 32 * 512;
|
14464
|
+
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
13640
14465
|
|
13641
|
-
const
|
13642
|
-
const
|
13643
|
-
const
|
14466
|
+
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
14467
|
+
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
14468
|
+
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
13644
14469
|
|
13645
14470
|
// quantize each expert separately since they have different importance matrices
|
13646
14471
|
new_size = 0;
|
@@ -14525,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
14525
15350
|
case LLM_ARCH_MINICPM:
|
14526
15351
|
case LLM_ARCH_XVERSE:
|
14527
15352
|
case LLM_ARCH_COMMAND_R:
|
15353
|
+
case LLM_ARCH_OLMO:
|
14528
15354
|
return LLAMA_ROPE_TYPE_NORM;
|
14529
15355
|
|
14530
15356
|
// the pairs of head values are offset by n_rot/2
|
14531
15357
|
case LLM_ARCH_FALCON:
|
14532
15358
|
case LLM_ARCH_GROK:
|
15359
|
+
case LLM_ARCH_DBRX:
|
14533
15360
|
case LLM_ARCH_PERSIMMON:
|
14534
15361
|
case LLM_ARCH_BERT:
|
14535
15362
|
case LLM_ARCH_NOMIC_BERT:
|
14536
15363
|
case LLM_ARCH_STABLELM:
|
14537
15364
|
case LLM_ARCH_QWEN:
|
14538
15365
|
case LLM_ARCH_QWEN2:
|
15366
|
+
case LLM_ARCH_QWEN2MOE:
|
14539
15367
|
case LLM_ARCH_PHI2:
|
14540
15368
|
case LLM_ARCH_GEMMA:
|
14541
15369
|
case LLM_ARCH_STARCODER2:
|
@@ -14905,9 +15733,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
14905
15733
|
llama_kv_cache_update_internal(*ctx);
|
14906
15734
|
}
|
14907
15735
|
|
15736
|
+
// deprecated
|
15737
|
+
size_t llama_get_state_size(const struct llama_context * ctx) {
|
15738
|
+
return llama_state_get_size(ctx);
|
15739
|
+
}
|
15740
|
+
|
15741
|
+
// deprecated
|
15742
|
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
15743
|
+
return llama_state_get_data(ctx, dst);
|
15744
|
+
}
|
15745
|
+
|
15746
|
+
// deprecated
|
15747
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
15748
|
+
return llama_state_set_data(ctx, src);
|
15749
|
+
}
|
15750
|
+
|
15751
|
+
// deprecated
|
15752
|
+
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15753
|
+
return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
15754
|
+
}
|
15755
|
+
|
15756
|
+
// deprecated
|
15757
|
+
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
15758
|
+
return llama_state_save_file(ctx, path_session, tokens, n_token_count);
|
15759
|
+
}
|
14908
15760
|
|
14909
15761
|
// Returns the *maximum* size of the state
|
14910
|
-
size_t
|
15762
|
+
size_t llama_state_get_size(const struct llama_context * ctx) {
|
14911
15763
|
const auto & cparams = ctx->cparams;
|
14912
15764
|
const auto & hparams = ctx->model.hparams;
|
14913
15765
|
|
@@ -14995,15 +15847,15 @@ struct llama_data_file_context : llama_data_context {
|
|
14995
15847
|
* file context:
|
14996
15848
|
* llama_file file("/path", "wb");
|
14997
15849
|
* llama_data_file_context data_ctx(&file);
|
14998
|
-
*
|
15850
|
+
* llama_state_get_data_internal(ctx, &data_ctx);
|
14999
15851
|
*
|
15000
15852
|
* buffer context:
|
15001
15853
|
* std::vector<uint8_t> buf(max_size, 0);
|
15002
15854
|
* llama_data_buffer_context data_ctx(buf.data());
|
15003
|
-
*
|
15855
|
+
* llama_state_get_data_internal(ctx, &data_ctx);
|
15004
15856
|
*
|
15005
15857
|
*/
|
15006
|
-
static void
|
15858
|
+
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
15007
15859
|
// copy rng
|
15008
15860
|
{
|
15009
15861
|
std::ostringstream rng_ss;
|
@@ -15147,15 +15999,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
15147
15999
|
}
|
15148
16000
|
}
|
15149
16001
|
|
15150
|
-
size_t
|
16002
|
+
size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
15151
16003
|
llama_data_buffer_context data_ctx(dst);
|
15152
|
-
|
16004
|
+
llama_state_get_data_internal(ctx, &data_ctx);
|
15153
16005
|
|
15154
16006
|
return data_ctx.get_size_written();
|
15155
16007
|
}
|
15156
16008
|
|
15157
16009
|
// Sets the state reading from the specified source address
|
15158
|
-
size_t
|
16010
|
+
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
15159
16011
|
const uint8_t * inp = src;
|
15160
16012
|
|
15161
16013
|
// set rng
|
@@ -15192,6 +16044,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15192
16044
|
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
16045
|
ctx->output_ids[id] = i;
|
15194
16046
|
}
|
16047
|
+
|
16048
|
+
ctx->n_outputs = n_outputs;
|
15195
16049
|
}
|
15196
16050
|
}
|
15197
16051
|
|
@@ -15307,14 +16161,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15307
16161
|
}
|
15308
16162
|
|
15309
16163
|
const size_t nread = inp - src;
|
15310
|
-
const size_t max_size =
|
16164
|
+
const size_t max_size = llama_state_get_size(ctx);
|
15311
16165
|
|
15312
16166
|
GGML_ASSERT(nread <= max_size);
|
15313
16167
|
|
15314
16168
|
return nread;
|
15315
16169
|
}
|
15316
16170
|
|
15317
|
-
static bool
|
16171
|
+
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15318
16172
|
llama_file file(path_session, "rb");
|
15319
16173
|
|
15320
16174
|
// sanity checks
|
@@ -15352,7 +16206,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
15352
16206
|
// restore the context state
|
15353
16207
|
{
|
15354
16208
|
const size_t n_state_size_cur = file.size - file.tell();
|
15355
|
-
const size_t n_state_size_max =
|
16209
|
+
const size_t n_state_size_max = llama_state_get_size(ctx);
|
15356
16210
|
|
15357
16211
|
if (n_state_size_cur > n_state_size_max) {
|
15358
16212
|
LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
@@ -15362,22 +16216,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
15362
16216
|
std::vector<uint8_t> state_data(n_state_size_max);
|
15363
16217
|
file.read_raw(state_data.data(), n_state_size_cur);
|
15364
16218
|
|
15365
|
-
|
16219
|
+
llama_state_set_data(ctx, state_data.data());
|
15366
16220
|
}
|
15367
16221
|
|
15368
16222
|
return true;
|
15369
16223
|
}
|
15370
16224
|
|
15371
|
-
bool
|
16225
|
+
bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15372
16226
|
try {
|
15373
|
-
return
|
16227
|
+
return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
15374
16228
|
} catch (const std::exception & err) {
|
15375
16229
|
LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
|
15376
16230
|
return false;
|
15377
16231
|
}
|
15378
16232
|
}
|
15379
16233
|
|
15380
|
-
bool
|
16234
|
+
static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
15381
16235
|
llama_file file(path_session, "wb");
|
15382
16236
|
|
15383
16237
|
file.write_u32(LLAMA_SESSION_MAGIC);
|
@@ -15391,11 +16245,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
15391
16245
|
|
15392
16246
|
// save the context state using stream saving
|
15393
16247
|
llama_data_file_context data_ctx(&file);
|
15394
|
-
|
16248
|
+
llama_state_get_data_internal(ctx, &data_ctx);
|
15395
16249
|
|
15396
16250
|
return true;
|
15397
16251
|
}
|
15398
16252
|
|
16253
|
+
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
16254
|
+
try {
|
16255
|
+
return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
|
16256
|
+
} catch (const std::exception & err) {
|
16257
|
+
LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
|
16258
|
+
return false;
|
16259
|
+
}
|
16260
|
+
}
|
16261
|
+
|
16262
|
+
size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
|
16263
|
+
// save the size of size_t as a uint32_t for safety check
|
16264
|
+
const size_t size_t_size_size = sizeof(uint32_t);
|
16265
|
+
|
16266
|
+
// other values
|
16267
|
+
const size_t s_cell_count_size = sizeof(uint32_t);
|
16268
|
+
const size_t s_layer_count_size = sizeof(uint32_t);
|
16269
|
+
const size_t n_embd_v_gqa_size = sizeof(uint32_t);
|
16270
|
+
|
16271
|
+
size_t s_cell_count = 0;
|
16272
|
+
size_t s_cell_data_size = 0;
|
16273
|
+
const auto & kv_self = ctx->kv_self;
|
16274
|
+
const auto & hparams = ctx->model.hparams;
|
16275
|
+
|
16276
|
+
const uint32_t n_layer = hparams.n_layer;
|
16277
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16278
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16279
|
+
|
16280
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
16281
|
+
const auto & cell = kv_self.cells[i];
|
16282
|
+
if (cell.seq_id.count(seq_id) > 0) {
|
16283
|
+
++s_cell_count;
|
16284
|
+
s_cell_data_size += sizeof(llama_pos);
|
16285
|
+
}
|
16286
|
+
}
|
16287
|
+
|
16288
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16289
|
+
// types of keys and values
|
16290
|
+
s_cell_data_size += sizeof(int32_t) * 2;
|
16291
|
+
// k_size_row and v_size_el values of layer
|
16292
|
+
s_cell_data_size += sizeof(size_t) * 2;
|
16293
|
+
|
16294
|
+
// keys
|
16295
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16296
|
+
s_cell_data_size += k_size_row * s_cell_count;
|
16297
|
+
|
16298
|
+
// values (transposed)
|
16299
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16300
|
+
s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
|
16301
|
+
}
|
16302
|
+
|
16303
|
+
const size_t s_total = (
|
16304
|
+
size_t_size_size +
|
16305
|
+
s_cell_count_size +
|
16306
|
+
s_layer_count_size +
|
16307
|
+
n_embd_v_gqa_size +
|
16308
|
+
s_cell_data_size
|
16309
|
+
);
|
16310
|
+
|
16311
|
+
return s_total;
|
16312
|
+
}
|
16313
|
+
|
16314
|
+
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
16315
|
+
const auto & kv_self = ctx->kv_self;
|
16316
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16317
|
+
|
16318
|
+
// Save the size of size_t as a uint32_t for safety check
|
16319
|
+
const uint32_t size_t_size = sizeof(size_t);
|
16320
|
+
data_ctx.write(&size_t_size, sizeof(size_t_size));
|
16321
|
+
|
16322
|
+
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
16323
|
+
uint32_t cell_count = 0;
|
16324
|
+
|
16325
|
+
// Count the number of cells with the specified seq_id
|
16326
|
+
// Find all the ranges of cells with this seq id
|
16327
|
+
{
|
16328
|
+
uint32_t cell_range_begin = kv_self.size;
|
16329
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
16330
|
+
const auto & cell = kv_self.cells[i];
|
16331
|
+
if (cell.has_seq_id(seq_id)) {
|
16332
|
+
++cell_count;
|
16333
|
+
if (cell_range_begin == kv_self.size) {
|
16334
|
+
cell_range_begin = i;
|
16335
|
+
}
|
16336
|
+
}
|
16337
|
+
else {
|
16338
|
+
if (cell_range_begin != kv_self.size) {
|
16339
|
+
cell_ranges.push_back({ cell_range_begin, i });
|
16340
|
+
cell_range_begin = kv_self.size;
|
16341
|
+
}
|
16342
|
+
}
|
16343
|
+
}
|
16344
|
+
if (cell_range_begin != kv_self.size) {
|
16345
|
+
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
16346
|
+
}
|
16347
|
+
|
16348
|
+
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
16349
|
+
uint32_t cell_count_check = 0;
|
16350
|
+
for (const auto & range : cell_ranges) {
|
16351
|
+
cell_count_check += range.second - range.first;
|
16352
|
+
}
|
16353
|
+
GGML_ASSERT(cell_count == cell_count_check);
|
16354
|
+
}
|
16355
|
+
|
16356
|
+
// Write the cell count
|
16357
|
+
data_ctx.write(&cell_count, sizeof(cell_count));
|
16358
|
+
|
16359
|
+
const auto & hparams = ctx->model.hparams;
|
16360
|
+
const uint32_t n_layer = hparams.n_layer;
|
16361
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16362
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16363
|
+
|
16364
|
+
// Write the layer count
|
16365
|
+
data_ctx.write(&n_layer, sizeof(n_layer));
|
16366
|
+
|
16367
|
+
// Write n_embd_v_gqa
|
16368
|
+
data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
|
16369
|
+
|
16370
|
+
// Iterate the ranges and write all the pos (this is the token position in the prompt)
|
16371
|
+
for (const auto & range : cell_ranges) {
|
16372
|
+
for (uint32_t i = range.first; i < range.second; ++i) {
|
16373
|
+
const auto & cell = kv_self.cells[i];
|
16374
|
+
data_ctx.write(&cell.pos, sizeof(cell.pos));
|
16375
|
+
}
|
16376
|
+
}
|
16377
|
+
|
16378
|
+
// Iterate and write all the keys first, each row is a cell
|
16379
|
+
// Get whole range at a time
|
16380
|
+
std::vector<uint8_t> tmp_buf;
|
16381
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16382
|
+
// Write key type
|
16383
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
16384
|
+
data_ctx.write(&k_type_i, sizeof(k_type_i));
|
16385
|
+
|
16386
|
+
// Write row size of key
|
16387
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16388
|
+
data_ctx.write(&k_size_row, sizeof(k_size_row));
|
16389
|
+
|
16390
|
+
// Read each range of cells of k_size length each into tmp_buf and write out
|
16391
|
+
for (const auto & range : cell_ranges) {
|
16392
|
+
const size_t range_size = range.second - range.first;
|
16393
|
+
tmp_buf.resize(range_size * k_size_row);
|
16394
|
+
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
16395
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16396
|
+
}
|
16397
|
+
}
|
16398
|
+
|
16399
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16400
|
+
const uint32_t kv_size = kv_self.size;
|
16401
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16402
|
+
// Write value type
|
16403
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16404
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16405
|
+
|
16406
|
+
// Write element size
|
16407
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16408
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16409
|
+
|
16410
|
+
// For each row, we get the element values of each cell
|
16411
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16412
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16413
|
+
for (const auto & range : cell_ranges) {
|
16414
|
+
const size_t range_size = range.second - range.first;
|
16415
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16416
|
+
tmp_buf.resize(range_size * v_size_el);
|
16417
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16418
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16419
|
+
}
|
16420
|
+
}
|
16421
|
+
}
|
16422
|
+
|
16423
|
+
return data_ctx.get_size_written();
|
16424
|
+
}
|
16425
|
+
|
16426
|
+
size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
|
16427
|
+
llama_data_buffer_context data_ctx(dst);
|
16428
|
+
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
16429
|
+
}
|
16430
|
+
|
16431
|
+
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
16432
|
+
auto & kv_self = ctx->kv_self;
|
16433
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16434
|
+
|
16435
|
+
// Wipe the slot
|
16436
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16437
|
+
|
16438
|
+
const uint8_t * inp = src;
|
16439
|
+
|
16440
|
+
// Read size of size_t
|
16441
|
+
uint32_t size_t_size;
|
16442
|
+
memcpy(&size_t_size, inp, sizeof(size_t_size));
|
16443
|
+
inp += sizeof(size_t_size);
|
16444
|
+
if (size_t_size != sizeof(size_t)) {
|
16445
|
+
LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
|
16446
|
+
return 0;
|
16447
|
+
}
|
16448
|
+
|
16449
|
+
// Read the cell count
|
16450
|
+
uint32_t cell_count;
|
16451
|
+
memcpy(&cell_count, inp, sizeof(cell_count));
|
16452
|
+
inp += sizeof(cell_count);
|
16453
|
+
|
16454
|
+
// Read the layer count
|
16455
|
+
uint32_t n_layer_ref;
|
16456
|
+
memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
|
16457
|
+
inp += sizeof(n_layer_ref);
|
16458
|
+
|
16459
|
+
// Read n_embd_v_gqa
|
16460
|
+
uint32_t n_embd_v_gqa_ref;
|
16461
|
+
memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
|
16462
|
+
inp += sizeof(n_embd_v_gqa_ref);
|
16463
|
+
|
16464
|
+
// Sanity check model compatibility
|
16465
|
+
const auto & hparams = ctx->model.hparams;
|
16466
|
+
const uint32_t n_layer = hparams.n_layer;
|
16467
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16468
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16469
|
+
if (n_layer != n_layer_ref) {
|
16470
|
+
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
|
16471
|
+
return 0;
|
16472
|
+
}
|
16473
|
+
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
|
16474
|
+
LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
|
16475
|
+
return 0;
|
16476
|
+
}
|
16477
|
+
|
16478
|
+
// Allocate the new cells for the slot
|
16479
|
+
if (cell_count) {
|
16480
|
+
llama_batch batch = llama_batch_init(cell_count, 0, 1);
|
16481
|
+
batch.n_tokens = cell_count;
|
16482
|
+
for (uint32_t i = 0; i < cell_count; ++i) {
|
16483
|
+
llama_pos pos;
|
16484
|
+
memcpy(&pos, inp, sizeof(pos));
|
16485
|
+
inp += sizeof(pos);
|
16486
|
+
|
16487
|
+
batch.pos[i] = pos;
|
16488
|
+
batch.n_seq_id[i] = 1;
|
16489
|
+
batch.seq_id[i][0] = dest_seq_id;
|
16490
|
+
}
|
16491
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
16492
|
+
llama_batch_free(batch);
|
16493
|
+
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
16494
|
+
return 0;
|
16495
|
+
}
|
16496
|
+
|
16497
|
+
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
16498
|
+
// Assume that this is one contiguous block of cells
|
16499
|
+
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
|
16500
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
|
16501
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
16502
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
|
16503
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
|
16504
|
+
|
16505
|
+
// Cleanup
|
16506
|
+
llama_batch_free(batch);
|
16507
|
+
}
|
16508
|
+
|
16509
|
+
const uint32_t kv_size = kv_self.size;
|
16510
|
+
const uint32_t kv_head = kv_self.head;
|
16511
|
+
|
16512
|
+
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
|
16513
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16514
|
+
// Read type of key
|
16515
|
+
int32_t k_type_i_ref;
|
16516
|
+
memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
|
16517
|
+
inp += sizeof(k_type_i_ref);
|
16518
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
16519
|
+
if (k_type_i != k_type_i_ref) {
|
16520
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16521
|
+
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
16522
|
+
return 0;
|
16523
|
+
}
|
16524
|
+
|
16525
|
+
// Read row size of key
|
16526
|
+
size_t k_size_row_ref;
|
16527
|
+
memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
|
16528
|
+
inp += sizeof(k_size_row_ref);
|
16529
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16530
|
+
if (k_size_row != k_size_row_ref) {
|
16531
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16532
|
+
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
|
16533
|
+
return 0;
|
16534
|
+
}
|
16535
|
+
|
16536
|
+
if (cell_count) {
|
16537
|
+
// Read and set the keys for the whole cell range
|
16538
|
+
ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
|
16539
|
+
inp += cell_count * k_size_row;
|
16540
|
+
}
|
16541
|
+
}
|
16542
|
+
|
16543
|
+
// For each layer, read the values for each cell (transposed)
|
16544
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16545
|
+
// Read type of value
|
16546
|
+
int32_t v_type_i_ref;
|
16547
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
16548
|
+
inp += sizeof(v_type_i_ref);
|
16549
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16550
|
+
if (v_type_i != v_type_i_ref) {
|
16551
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16552
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
16553
|
+
return 0;
|
16554
|
+
}
|
16555
|
+
|
16556
|
+
// Read element size of value
|
16557
|
+
size_t v_size_el_ref;
|
16558
|
+
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
16559
|
+
inp += sizeof(v_size_el_ref);
|
16560
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16561
|
+
if (v_size_el != v_size_el_ref) {
|
16562
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16563
|
+
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
16564
|
+
return 0;
|
16565
|
+
}
|
16566
|
+
|
16567
|
+
if (cell_count) {
|
16568
|
+
// For each row in the transposed matrix, read the values for the whole cell range
|
16569
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16570
|
+
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
16571
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
16572
|
+
inp += cell_count * v_size_el;
|
16573
|
+
}
|
16574
|
+
}
|
16575
|
+
}
|
16576
|
+
|
16577
|
+
const size_t nread = inp - src;
|
16578
|
+
return nread;
|
16579
|
+
}
|
16580
|
+
|
16581
|
+
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
16582
|
+
llama_file file(filepath, "wb");
|
16583
|
+
|
16584
|
+
file.write_u32(LLAMA_STATE_SEQ_MAGIC);
|
16585
|
+
file.write_u32(LLAMA_STATE_SEQ_VERSION);
|
16586
|
+
|
16587
|
+
// save the prompt
|
16588
|
+
file.write_u32((uint32_t)n_token_count);
|
16589
|
+
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
16590
|
+
|
16591
|
+
// save the context state using stream saving
|
16592
|
+
llama_data_file_context data_ctx(&file);
|
16593
|
+
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
16594
|
+
|
16595
|
+
const size_t res = file.tell();
|
16596
|
+
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
|
16597
|
+
return res;
|
16598
|
+
}
|
16599
|
+
|
16600
|
+
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
16601
|
+
llama_file file(filepath, "rb");
|
16602
|
+
|
16603
|
+
// version checks
|
16604
|
+
{
|
16605
|
+
const uint32_t magic = file.read_u32();
|
16606
|
+
const uint32_t version = file.read_u32();
|
16607
|
+
|
16608
|
+
if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
|
16609
|
+
LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
|
16610
|
+
return 0;
|
16611
|
+
}
|
16612
|
+
}
|
16613
|
+
|
16614
|
+
// load the prompt
|
16615
|
+
{
|
16616
|
+
const uint32_t n_token_count = file.read_u32();
|
16617
|
+
|
16618
|
+
if (n_token_count > n_token_capacity) {
|
16619
|
+
LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
16620
|
+
return 0;
|
16621
|
+
}
|
16622
|
+
|
16623
|
+
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
16624
|
+
*n_token_count_out = n_token_count;
|
16625
|
+
}
|
16626
|
+
|
16627
|
+
// restore the context state
|
16628
|
+
{
|
16629
|
+
const size_t state_size = file.size - file.tell();
|
16630
|
+
std::vector<uint8_t> state_data(state_size);
|
16631
|
+
file.read_raw(state_data.data(), state_size);
|
16632
|
+
const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
|
16633
|
+
if (!nread) {
|
16634
|
+
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
|
16635
|
+
return 0;
|
16636
|
+
}
|
16637
|
+
GGML_ASSERT(nread <= state_size);
|
16638
|
+
GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
|
16639
|
+
}
|
16640
|
+
|
16641
|
+
return file.tell();
|
16642
|
+
}
|
16643
|
+
|
16644
|
+
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
16645
|
+
try {
|
16646
|
+
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
|
16647
|
+
} catch (const std::exception & err) {
|
16648
|
+
LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
|
16649
|
+
return 0;
|
16650
|
+
}
|
16651
|
+
}
|
16652
|
+
|
16653
|
+
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
16654
|
+
try {
|
16655
|
+
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
|
16656
|
+
} catch (const std::exception & err) {
|
16657
|
+
LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
|
16658
|
+
return 0;
|
16659
|
+
}
|
16660
|
+
}
|
16661
|
+
|
15399
16662
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
15400
16663
|
ctx->cparams.n_threads = n_threads;
|
15401
16664
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
@@ -15509,23 +16772,31 @@ float * llama_get_logits(struct llama_context * ctx) {
|
|
15509
16772
|
}
|
15510
16773
|
|
15511
16774
|
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
16775
|
+
int32_t j = -1;
|
15512
16776
|
llama_synchronize(ctx);
|
15513
16777
|
|
15514
16778
|
try {
|
15515
16779
|
if (ctx->logits == nullptr) {
|
15516
16780
|
throw std::runtime_error("no logits");
|
15517
16781
|
}
|
15518
|
-
|
16782
|
+
|
16783
|
+
if (i < 0) {
|
16784
|
+
j = ctx->n_outputs + i;
|
16785
|
+
if (j < 0) {
|
16786
|
+
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
|
16787
|
+
}
|
16788
|
+
} else if ((size_t) i >= ctx->output_ids.size()) {
|
15519
16789
|
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
16790
|
+
} else {
|
16791
|
+
j = ctx->output_ids[i];
|
15520
16792
|
}
|
15521
|
-
const int32_t j = ctx->output_ids[i];
|
15522
16793
|
|
15523
16794
|
if (j < 0) {
|
15524
16795
|
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15525
16796
|
}
|
15526
|
-
if (
|
16797
|
+
if (j >= ctx->n_outputs) {
|
15527
16798
|
// This should not happen
|
15528
|
-
throw std::runtime_error(format("corrupt output buffer (j=%d,
|
16799
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
|
15529
16800
|
}
|
15530
16801
|
|
15531
16802
|
return ctx->logits + j*ctx->model.hparams.n_vocab;
|
@@ -15545,23 +16816,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
15545
16816
|
}
|
15546
16817
|
|
15547
16818
|
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
16819
|
+
int32_t j = -1;
|
16820
|
+
|
15548
16821
|
llama_synchronize(ctx);
|
15549
16822
|
|
15550
16823
|
try {
|
15551
16824
|
if (ctx->embd == nullptr) {
|
15552
16825
|
throw std::runtime_error("no embeddings");
|
15553
16826
|
}
|
15554
|
-
|
16827
|
+
|
16828
|
+
if (i < 0) {
|
16829
|
+
j = ctx->n_outputs + i;
|
16830
|
+
if (j < 0) {
|
16831
|
+
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
|
16832
|
+
}
|
16833
|
+
} else if ((size_t) i >= ctx->output_ids.size()) {
|
15555
16834
|
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
16835
|
+
} else {
|
16836
|
+
j = ctx->output_ids[i];
|
15556
16837
|
}
|
15557
|
-
const int32_t j = ctx->output_ids[i];
|
15558
16838
|
|
15559
16839
|
if (j < 0) {
|
15560
16840
|
throw std::runtime_error(format("batch.logits[%d] != true", i));
|
15561
16841
|
}
|
15562
|
-
if (
|
16842
|
+
if (j >= ctx->n_outputs) {
|
15563
16843
|
// This should not happen
|
15564
|
-
throw std::runtime_error(format("corrupt output buffer (j=%d,
|
16844
|
+
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
|
15565
16845
|
}
|
15566
16846
|
|
15567
16847
|
return ctx->embd + j*ctx->model.hparams.n_embd;
|
@@ -15608,6 +16888,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
|
|
15608
16888
|
return model->vocab.special_eos_id;
|
15609
16889
|
}
|
15610
16890
|
|
16891
|
+
// Returns the special_cls_id stored in the model's vocabulary.
llama_token llama_token_cls(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.special_cls_id;
}
|
16894
|
+
|
16895
|
+
// Returns the special_sep_id stored in the model's vocabulary.
llama_token llama_token_sep(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.special_sep_id;
}
|
16898
|
+
|
15611
16899
|
// Returns the linefeed_id stored in the model's vocabulary.
llama_token llama_token_nl(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.linefeed_id;
}
|
@@ -15642,9 +16930,9 @@ int32_t llama_tokenize(
|
|
15642
16930
|
int32_t text_len,
|
15643
16931
|
llama_token * tokens,
|
15644
16932
|
int32_t n_tokens_max,
|
15645
|
-
bool
|
15646
|
-
bool
|
15647
|
-
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len),
|
16933
|
+
bool add_special,
|
16934
|
+
bool parse_special) {
|
16935
|
+
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
|
15648
16936
|
|
15649
16937
|
if (n_tokens_max < (int) res.size()) {
|
15650
16938
|
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
@@ -15910,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
|
|
15910
17198
|
if (add_ass) {
|
15911
17199
|
ss << "### Response:\n";
|
15912
17200
|
}
|
17201
|
+
} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
|
17202
|
+
// CohereForAI/c4ai-command-r-plus
|
17203
|
+
for (auto message : chat) {
|
17204
|
+
std::string role(message->role);
|
17205
|
+
if (role == "system") {
|
17206
|
+
ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17207
|
+
} else if (role == "user") {
|
17208
|
+
ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17209
|
+
} else if (role == "assistant") {
|
17210
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
|
17211
|
+
}
|
17212
|
+
}
|
17213
|
+
if (add_ass) {
|
17214
|
+
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
17215
|
+
}
|
15913
17216
|
} else {
|
15914
17217
|
// template not supported
|
15915
17218
|
return -1;
|