llama_cpp 0.14.4 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +23 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +10 -0
- data/vendor/tmp/llama.cpp/LICENSE +1 -1
- data/vendor/tmp/llama.cpp/Makefile +29 -9
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +142 -49
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +156 -156
- data/vendor/tmp/llama.cpp/ggml-quants.h +82 -82
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +942 -267
- data/vendor/tmp/llama.cpp/ggml.c +161 -95
- data/vendor/tmp/llama.cpp/ggml.h +12 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1577 -274
- data/vendor/tmp/llama.cpp/llama.h +81 -13
- data/vendor/tmp/llama.cpp/sgemm.cpp +1148 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
data/vendor/tmp/llama.cpp/llama.cpp

@@ -105,7 +105,7 @@
 #endif
 
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 60
 
 
 //
@@ -209,6 +209,7 @@ enum llm_arch {
 LLM_ARCH_STABLELM,
 LLM_ARCH_QWEN,
 LLM_ARCH_QWEN2,
+LLM_ARCH_QWEN2MOE,
 LLM_ARCH_PHI2,
 LLM_ARCH_PLAMO,
 LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
 LLM_ARCH_MAMBA,
 LLM_ARCH_XVERSE,
 LLM_ARCH_COMMAND_R,
+LLM_ARCH_DBRX,
+LLM_ARCH_OLMO,
 LLM_ARCH_UNKNOWN,
 };
 
@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_STABLELM, "stablelm" },
 { LLM_ARCH_QWEN, "qwen" },
 { LLM_ARCH_QWEN2, "qwen2" },
+{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
 { LLM_ARCH_PHI2, "phi2" },
 { LLM_ARCH_PLAMO, "plamo" },
 { LLM_ARCH_CODESHELL, "codeshell" },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_MAMBA, "mamba" },
 { LLM_ARCH_XVERSE, "xverse" },
 { LLM_ARCH_COMMAND_R, "command-r" },
+{ LLM_ARCH_DBRX, "dbrx" },
+{ LLM_ARCH_OLMO, "olmo" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -261,6 +267,7 @@ enum llm_kv {
 LLM_KV_GENERAL_ALIGNMENT,
 LLM_KV_GENERAL_NAME,
 LLM_KV_GENERAL_AUTHOR,
+LLM_KV_GENERAL_VERSION,
 LLM_KV_GENERAL_URL,
 LLM_KV_GENERAL_DESCRIPTION,
 LLM_KV_GENERAL_LICENSE,
@@ -317,11 +324,17 @@ enum llm_kv {
 LLM_KV_TOKENIZER_UNK_ID,
 LLM_KV_TOKENIZER_SEP_ID,
 LLM_KV_TOKENIZER_PAD_ID,
+LLM_KV_TOKENIZER_CLS_ID,
+LLM_KV_TOKENIZER_MASK_ID,
 LLM_KV_TOKENIZER_ADD_BOS,
 LLM_KV_TOKENIZER_ADD_EOS,
 LLM_KV_TOKENIZER_ADD_PREFIX,
 LLM_KV_TOKENIZER_HF_JSON,
 LLM_KV_TOKENIZER_RWKV,
+LLM_KV_TOKENIZER_PREFIX_ID,
+LLM_KV_TOKENIZER_SUFFIX_ID,
+LLM_KV_TOKENIZER_MIDDLE_ID,
+LLM_KV_TOKENIZER_EOT_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -330,6 +343,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
 { LLM_KV_GENERAL_NAME, "general.name" },
 { LLM_KV_GENERAL_AUTHOR, "general.author" },
+{ LLM_KV_GENERAL_VERSION, "general.version" },
 { LLM_KV_GENERAL_URL, "general.url" },
 { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
 { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,11 +400,17 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
 { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
 { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
 { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
 { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
 { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
 { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
 { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
 };
 
 struct LLM_KV {
@@ -421,6 +441,7 @@ enum llm_tensor {
 LLM_TENSOR_ATTN_OUT_NORM,
 LLM_TENSOR_ATTN_ROT_EMBD,
 LLM_TENSOR_FFN_GATE_INP,
+LLM_TENSOR_FFN_GATE_INP_SHEXP,
 LLM_TENSOR_FFN_NORM,
 LLM_TENSOR_FFN_GATE,
 LLM_TENSOR_FFN_DOWN,
@@ -432,6 +453,9 @@ enum llm_tensor {
 LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
 LLM_TENSOR_FFN_GATE_EXPS,
 LLM_TENSOR_FFN_UP_EXPS,
+LLM_TENSOR_FFN_DOWN_SHEXP,
+LLM_TENSOR_FFN_GATE_SHEXP,
+LLM_TENSOR_FFN_UP_SHEXP,
 LLM_TENSOR_ATTN_Q_NORM,
 LLM_TENSOR_ATTN_K_NORM,
 LLM_TENSOR_LAYER_OUT_NORM,
@@ -694,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
 },
 },
 {
@@ -729,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_QWEN2MOE,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+},
+},
 {
 LLM_ARCH_PHI2,
 {
@@ -924,6 +972,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+},
+},
+{
+LLM_ARCH_DBRX,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+},
+},
+{
+LLM_ARCH_OLMO,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
 {
@@ -1630,17 +1710,17 @@ static size_t llama_get_device_memory(int device) {
 #if defined(GGML_USE_CUDA)
 size_t total;
 size_t free;
-ggml_backend_cuda_get_device_memory(device, &
+ggml_backend_cuda_get_device_memory(device, &free, &total);
 return free;
 #elif defined(GGML_USE_SYCL)
 size_t total;
 size_t free;
-ggml_backend_sycl_get_device_memory(device, &
+ggml_backend_sycl_get_device_memory(device, &free, &total);
 return free;
 #elif defined(GGML_USE_VULKAN)
 size_t total;
 size_t free;
-ggml_backend_vk_get_device_memory(device, &
+ggml_backend_vk_get_device_memory(device, &free, &total);
 return free;
 #else
 return 1;
@@ -1682,6 +1762,7 @@ enum e_model {
 MODEL_4B,
 MODEL_7B,
 MODEL_8B,
+MODEL_12B,
 MODEL_13B,
 MODEL_14B,
 MODEL_15B,
@@ -1697,6 +1778,10 @@ enum e_model {
 MODEL_MEDIUM,
 MODEL_LARGE,
 MODEL_XL,
+MODEL_A2_7B,
+MODEL_8x7B,
+MODEL_8x22B,
+MODEL_16x12B,
 };
 
 static const size_t kiB = 1024;
@@ -1880,6 +1965,12 @@ struct llama_layer {
 struct ggml_tensor * ffn_down_exps;
 struct ggml_tensor * ffn_up_exps ;
 
+// ff shared expert (shexp)
+struct ggml_tensor * ffn_gate_inp_shexp;
+struct ggml_tensor * ffn_gate_shexp;
+struct ggml_tensor * ffn_down_shexp;
+struct ggml_tensor * ffn_up_shexp;
+
 // ff bias
 struct ggml_tensor * ffn_down_b; // b2
 struct ggml_tensor * ffn_up_b; // b3
@@ -2014,20 +2105,22 @@ struct llama_vocab {
 std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
 // default LLaMA special tokens
-id special_bos_id
-id special_eos_id
-id special_unk_id
-id special_sep_id
-id special_pad_id
+id special_bos_id = 1;
+id special_eos_id = 2;
+id special_unk_id = 0;
+id special_sep_id = -1;
+id special_pad_id = -1;
+id special_cls_id = -1;
+id special_mask_id = -1;
 
 int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
 int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
 
 id linefeed_id = 13;
-id special_prefix_id =
-id
-id
-id special_eot_id =
+id special_prefix_id = -1;
+id special_suffix_id = -1;
+id special_middle_id = -1;
+id special_eot_id = -1;
 
 bool add_space_prefix = true;
 
@@ -2175,7 +2268,7 @@ struct llama_context {
 
 std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 size_t output_size = 0; // capacity (of tokens positions) for the output buffers
-int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
 
 bool logits_all = false;
 
@@ -3533,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_3B: return "3B";
 case MODEL_7B: return "7B";
 case MODEL_8B: return "8B";
+case MODEL_12B: return "12B";
 case MODEL_13B: return "13B";
 case MODEL_14B: return "14B";
 case MODEL_15B: return "15B";
@@ -3548,6 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_MEDIUM: return "0.4B";
 case MODEL_LARGE: return "0.8B";
 case MODEL_XL: return "1.5B";
+case MODEL_A2_7B: return "A2.7B";
+case MODEL_8x7B: return "8x7B";
+case MODEL_8x22B: return "8x22B";
+case MODEL_16x12B: return "16x12B";
 default: return "?B";
 }
 }
@@ -3662,15 +3760,23 @@ static void llm_load_hparams(
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-
-
-
-
-
-
-
-
-
+if (hparams.n_expert == 8) {
+switch (hparams.n_layer) {
+case 32: model.type = e_model::MODEL_8x7B; break;
+case 56: model.type = e_model::MODEL_8x22B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} else {
+switch (hparams.n_layer) {
+case 22: model.type = e_model::MODEL_1B; break;
+case 26: model.type = e_model::MODEL_3B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_13B; break;
+case 48: model.type = e_model::MODEL_34B; break;
+case 60: model.type = e_model::MODEL_30B; break;
+case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
 }
 } break;
 case LLM_ARCH_MINICPM:
@@ -3812,6 +3918,7 @@ static void llm_load_hparams(
 switch (hparams.n_layer) {
 case 24: model.type = e_model::MODEL_1B; break;
 case 32: model.type = e_model::MODEL_3B; break;
+case 40: model.type = e_model::MODEL_12B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -3836,6 +3943,14 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_QWEN2MOE:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 24: model.type = e_model::MODEL_A2_7B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 case LLM_ARCH_PHI2:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3961,6 +4076,28 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_DBRX:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+switch (hparams.n_layer) {
+case 40: model.type = e_model::MODEL_16x12B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
+case LLM_ARCH_OLMO:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+switch (hparams.n_layer) {
+case 22: model.type = e_model::MODEL_1B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 80: model.type = e_model::MODEL_70B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
 
@@ -3974,7 +4111,9 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(
+static std::vector<llama_vocab::id> llama_tokenize_internal(
+const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -3996,23 +4135,53 @@ static void llm_load_vocab(
 vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
-vocab.
+vocab.special_bos_id = -1;
+vocab.special_eos_id = -1;
+vocab.special_unk_id = -1;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;
+vocab.linefeed_id = -1;
 
 return;
 } else if (tokenizer_name == "llama") {
 vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = 1;
+vocab.special_eos_id = 2;
+vocab.special_unk_id = 0;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;
+
+// For Fill-In-the-Middle (FIM)/infill models which where converted
+// prior to support of FIM special tokens in GGUF, the following
+// will allow those models to continue to work. The general names
+// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+// new versions of these models have been published.
+std::string gen_name;
+ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+[](unsigned char c){ return std::tolower(c); });
+
+if (gen_name.find("code") != std::string::npos) {
+if (model.arch == LLM_ARCH_LLAMA) {
+vocab.special_prefix_id = 32007;
+vocab.special_suffix_id = 32008;
+vocab.special_middle_id = 32009;
+vocab.special_eot_id = 32010;
+} else if (model.arch == LLM_ARCH_GEMMA) {
+vocab.special_prefix_id = 67;
+vocab.special_suffix_id = 69;
+vocab.special_middle_id = 68;
+vocab.special_eot_id = 70;
+}
+}
 
 const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
 if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4216,24 @@ static void llm_load_vocab(
 }
 
 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = 11;
+vocab.special_eos_id = 11;
+vocab.special_unk_id = -1;
+vocab.special_sep_id = -1;
+vocab.special_pad_id = -1;
+vocab.special_cls_id = -1;
+vocab.special_mask_id = -1;
 } else if (tokenizer_name == "bert") {
 vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
 // default special tokens
-vocab.special_bos_id
-vocab.special_eos_id
-vocab.special_unk_id
-vocab.special_sep_id
-vocab.special_pad_id
+vocab.special_bos_id = -1;
+vocab.special_eos_id = -1;
+vocab.special_unk_id = 100;
+vocab.special_sep_id = 102;
+vocab.special_pad_id = 0;
+vocab.special_cls_id = 101;
+vocab.special_mask_id = 103;
 vocab.add_space_prefix = false;
 } else {
 LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4296,17 @@ static void llm_load_vocab(
 // special tokens
 {
 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-{ LLM_KV_TOKENIZER_BOS_ID,
-{ LLM_KV_TOKENIZER_EOS_ID,
-{ LLM_KV_TOKENIZER_UNK_ID,
-{ LLM_KV_TOKENIZER_SEP_ID,
-{ LLM_KV_TOKENIZER_PAD_ID,
+{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
 };
 for (const auto & it : special_token_types) {
 const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4498,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
 // special tokens
-if (vocab.special_bos_id
-if (vocab.special_eos_id
-if (vocab.special_unk_id
-if (vocab.special_sep_id
-if (vocab.special_pad_id
-if (vocab.
+if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
 // Returns false if cancelled by progress_callback
@@ -4342,6 +4523,13 @@ static bool llm_load_tensors(
 
 auto & hparams = model.hparams;
 
+#ifdef GGML_USE_SYCL
+// disable MoE with SYCL until mul_mat_id is updated
+if (hparams.n_expert > 0) {
+n_gpu_layers = 0;
+}
+#endif
+
 model.split_mode = split_mode;
 model.main_gpu = main_gpu;
 model.n_gpu_layers = n_gpu_layers;
@@ -4439,7 +4627,7 @@ static bool llm_load_tensors(
 size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
 // for moe merged tensors
-ctx_size += ggml_tensor_overhead()*
+ctx_size += ggml_tensor_overhead()*n_layer*3;
 
 std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
 for (auto & it : buft_layer_count) {
@@ -4635,6 +4823,39 @@ static bool llm_load_tensors(
 layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
 }
 } break;
+case LLM_ARCH_DBRX:
+{
+if (n_expert == 0) {
+throw std::runtime_error("DBRX model cannot have zero experts");
+}
+
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+}
+} break;
 case LLM_ARCH_BAICHUAN:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4949,8 +5170,13 @@ static bool llm_load_tensors(
 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
 
-
-layer.
+// optional q and k layernorms, present in StableLM 2 12B
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
 
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -4993,7 +5219,13 @@ static bool llm_load_tensors(
 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ml.n_created--; // artificial tensor
+ml.size_data += ggml_nbytes(model.output);
+}
 }
 
 for (int i = 0; i < n_layer; ++i) {
@@ -5021,6 +5253,54 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 }
 } break;
+case LLM_ARCH_QWEN2MOE:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+// optional bias tensors
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+GGML_ASSERT(hparams.n_expert > 0);
+GGML_ASSERT(hparams.n_expert_used > 0);
+
+// MoE branch
+auto n_ff_exp = n_ff / hparams.n_expert_used;
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+// Shared expert branch
+layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
+layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
+layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+}
+} break;
 case LLM_ARCH_PHI2:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5404,11 +5684,47 @@ static bool llm_load_tensors(
 
 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
+if (n_layer >= 64){
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+}
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+}
+} break;
+case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ml.n_created--; // artificial tensor
+ml.size_data += ggml_nbytes(model.output);
+}
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
 layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
+
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5849,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
 return cur;
 }
 
+static struct ggml_tensor * llm_build_moe_ffn(
+struct ggml_context * ctx,
+struct ggml_tensor * cur,
+struct ggml_tensor * gate_inp,
+struct ggml_tensor * up_exps,
+struct ggml_tensor * gate_exps,
+struct ggml_tensor * down_exps,
+int64_t n_expert,
+int64_t n_expert_used,
+llm_ffn_op_type type_op,
+bool norm_w,
+const llm_build_cb & cb,
+int il) {
+int64_t n_embd = cur->ne[0];
+int64_t n_tokens = cur->ne[1];
+
+ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+cb(logits, "ffn_moe_logits", il);
+
+ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+cb(probs, "ffn_moe_probs", il);
+
+// select experts
+ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+cb(selected_experts->src[0], "ffn_moe_argsort", il);
+cb(selected_experts, "ffn_moe_topk", il);
+
+ggml_tensor * weights = ggml_get_rows(ctx,
+ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+cb(weights, "ffn_moe_weights", il);
+
+if (norm_w) {
+weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+cb(weights_sum, "ffn_moe_weights_sum", il);
+
+weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+cb(weights, "ffn_moe_weights_norm", il);
+
+weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+}
+
+cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+cb(up, "ffn_moe_up", il);
+
+ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+cb(gate, "ffn_moe_gate", il);
+
+switch (type_op) {
+case LLM_FFN_SILU:
+{
+gate = ggml_silu(ctx, gate);
+cb(gate, "ffn_moe_silu", il);
+} break;
+case LLM_FFN_GELU:
+{
+gate = ggml_gelu(ctx, gate);
+cb(gate, "ffn_moe_gelu", il);
+} break;
+default:
+GGML_ASSERT(false);
+}
+
+ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+cb(par, "ffn_moe_gate_par", il);
+
+ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+cb(experts, "ffn_moe_down", il);
+
+experts = ggml_mul(ctx, experts, weights);
+
+// aggregate experts
+ggml_tensor * moe_out = nullptr;
+for (int i = 0; i < n_expert_used; ++i) {
+ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+experts->nb[2], i*experts->nb[1]);
+
+if (i == 0) {
+moe_out = cur_expert;
+} else {
+moe_out = ggml_add(ctx, moe_out, cur_expert);
+}
+}
+
+if (n_expert_used == 1) {
+// avoid returning a non-contiguous tensor
+moe_out = ggml_cont(ctx, moe_out);
+}
+
+return moe_out;
+}
+
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
 struct ggml_context * ctx,
@@ -6392,62 +6802,15 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, il);
 cb(cur, "ffn_norm", il);
 
-
-
-
-
-
-
-
-
-cb(
-
-ggml_tensor * weights = ggml_get_rows(ctx0,
-ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-cb(weights, "ffn_moe_weights", il);
-
-weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-cb(weights_sum, "ffn_moe_weights_sum", il);
-
-weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-cb(weights, "ffn_moe_weights_norm", il);
-
-// compute expert outputs
-ggml_tensor * moe_out = nullptr;
-
-for (int i = 0; i < n_expert_used; ++i) {
-ggml_tensor * cur_expert;
-
-ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-cb(cur_up, "ffn_moe_up", il);
-
-ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-cb(cur_gate, "ffn_moe_gate", il);
-
-cur_gate = ggml_silu(ctx0, cur_gate);
-cb(cur_gate, "ffn_moe_silu", il);
-
-cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-cb(cur_expert, "ffn_moe_gate_par", il);
-
-cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-cb(cur_expert, "ffn_moe_down", il);
-
-cur_expert = ggml_mul(ctx0, cur_expert,
-ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-cb(cur_expert, "ffn_moe_weighted", il);
-
-if (i == 0) {
-moe_out = cur_expert;
-} else {
-moe_out = ggml_add(ctx0, moe_out, cur_expert);
-cb(moe_out, "ffn_moe_out", il);
-}
-}
-
-cur = moe_out;
+cur = llm_build_moe_ffn(ctx0, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_SILU, true,
+cb, il);
+cb(cur, "ffn_moe_out", il);
 }
 
 cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6926,63 +7289,15 @@ struct llm_build_context {
 LLM_NORM_RMS, cb, il);
 cb(cur, "ffn_norm", il);
 
-
-
-
-
-
-
-
-
-cb(
-
-ggml_tensor * weights = ggml_get_rows(ctx0,
-ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
-cb(weights, "ffn_moe_weights", il);
-
-weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
-ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
-cb(weights_sum, "ffn_moe_weights_sum", il);
-
-weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
-cb(weights, "ffn_moe_weights_norm", il);
-
-// compute expert outputs
-ggml_tensor * moe_out = nullptr;
-
-for (int i = 0; i < n_expert_used; ++i) {
-ggml_tensor * cur_expert;
-
-ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
-cb(cur_up, "ffn_moe_up", il);
-
-ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-cb(cur_gate, "ffn_moe_gate", il);
-
-//GeLU
-cur_gate = ggml_gelu(ctx0, cur_gate);
-cb(cur_gate, "ffn_moe_gelu", il);
-
-cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-cb(cur_expert, "ffn_moe_gate_par", il);
-
-cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
-cb(cur_expert, "ffn_moe_down", il);
-
-cur_expert = ggml_mul(ctx0, cur_expert,
-ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-cb(cur_expert, "ffn_moe_weighted", il);
-
-if (i == 0) {
-moe_out = cur_expert;
-} else {
-moe_out = ggml_add(ctx0, moe_out, cur_expert);
-cb(moe_out, "ffn_moe_out", il);
-}
-}
-
-cur = moe_out;
+cur = llm_build_moe_ffn(ctx0, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_GELU, true,
+cb, il);
+cb(cur, "ffn_moe_out", il);
 
 // Grok
 // if layer_out_norm is present then apply it before adding the input
@@ -6994,7 +7309,6 @@ struct llm_build_context {
 cb(cur, "layer_out_norm", il);
 }
 
-
 cur = ggml_add(ctx0, cur, ffn_inp);
 cb(cur, "ffn_out", il);
 
@@ -7030,12 +7344,16 @@ struct llm_build_context {
 return gf;
 }
 
-struct ggml_cgraph *
+struct ggml_cgraph * build_dbrx() {
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
 const int64_t n_embd_head = hparams.n_embd_head_v;
 const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
 
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
@@ -7048,16 +7366,140 @@ struct llm_build_context {
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
-cb(pos, "pos_embd", -1);
-
-inpL = ggml_add(ctx0, inpL, pos);
-cb(inpL, "inpL", -1);
-
 for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
 cur = llm_build_norm(ctx0, inpL, hparams,
-
-
+model.layers[il].attn_norm, NULL,
+LLM_NORM, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+struct ggml_tensor * Qcur = nullptr;
+struct ggml_tensor * Kcur = nullptr;
+struct ggml_tensor * Vcur = nullptr;
+
+cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cb(cur, "wqkv", il);
+
+cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+cb(cur, "wqkv_clamped", il);
+
+Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+cb(Qcur, "Qcur", il);
+cb(Kcur, "Kcur", il);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, NULL,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+// MoE branch
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].attn_out_norm, NULL,
+LLM_NORM, cb, il);
+cb(cur, "attn_out_norm", il);
+
+cur = llm_build_moe_ffn(ctx0, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_SILU, true,
+cb, il);
+cb(cur, "ffn_moe_out", il);
+
+cur = ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "ffn_out", il);
+
+ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+if (layer_dir != nullptr) {
+cur = ggml_add(ctx0, cur, layer_dir);
+}
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, NULL,
+LLM_NORM, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
+struct ggml_cgraph * build_starcoder() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+cb(pos, "pos_embd", -1);
+
+inpL = ggml_add(ctx0, inpL, pos);
+cb(inpL, "inpL", -1);
+
+for (int il = 0; il < n_layer; ++il) {
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm,
+model.layers[il].attn_norm_b,
 LLM_NORM, cb, il);
 cb(cur, "attn_norm", il);
 
@@ -7882,7 +8324,7 @@ struct llm_build_context {
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
 for (int il = 0; il < n_layer; ++il) {
-
+
 
 // norm
 cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7891,6 +8333,8 @@ struct llm_build_context {
 LLM_NORM, cb, il);
 cb(cur, "attn_norm", il);
 
+struct ggml_tensor * inpSA = cur;
+
 // self-attention
 {
 // compute Q and K and RoPE them
@@ -7915,15 +8359,36 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }
 
+Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+cb(Qcur, "Qcur", il);
+Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+cb(Kcur, "Kcur", il);
+
+if (model.layers[il].attn_q_norm) {
+Qcur = llm_build_norm(ctx0, Qcur, hparams,
+model.layers[il].attn_q_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Qcur, "Qcur", il);
+}
+if (model.layers[il].attn_k_norm) {
+Kcur = llm_build_norm(ctx0, Kcur, hparams,
+model.layers[il].attn_k_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Kcur, "Kcur", il);
+}
+
+
 Qcur = ggml_rope_custom(
-ctx0,
+ctx0, Qcur, inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);
 
 Kcur = ggml_rope_custom(
-ctx0,
+ctx0, Kcur, inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
@@ -7938,20 +8403,25 @@ struct llm_build_context {
 // skip computing output for unused tokens
 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }
 
-struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur,
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
 cb(ffn_inp, "ffn_inp", il);
 
 // feed-forward network
 {
-
-
-
-
-
-
+if (model.layers[il].ffn_norm) {
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm,
+model.layers[il].ffn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "ffn_norm", il);
+} else {
+// parallel residual
+cur = inpSA;
+}
 cur = llm_build_ffn(ctx0, cur,
 model.layers[il].ffn_up, NULL,
 model.layers[il].ffn_gate, NULL,
@@ -8141,12 +8611,6 @@ struct llm_build_context {
 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
 cb(Vcur, "Vcur", il);
 
-// these nodes are added to the graph together so that they are not reordered
-// by doing so, the number of splits in the graph is reduced
-ggml_build_forward_expand(gf, Qcur);
-ggml_build_forward_expand(gf, Kcur);
-ggml_build_forward_expand(gf, Vcur);
-
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8213,6 +8677,150 @@ struct llm_build_context {
 return gf;
 }
 
+struct ggml_cgraph * build_qwen2moe() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+
+// self_attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// MoE branch
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+ggml_tensor * moe_out =
+llm_build_moe_ffn(ctx0, cur,
8766
|
+
model.layers[il].ffn_gate_inp,
|
8767
|
+
model.layers[il].ffn_up_exps,
|
8768
|
+
model.layers[il].ffn_gate_exps,
|
8769
|
+
model.layers[il].ffn_down_exps,
|
8770
|
+
n_expert, n_expert_used,
|
8771
|
+
LLM_FFN_SILU, false,
|
8772
|
+
cb, il);
|
8773
|
+
cb(cur, "ffn_moe_out", il);
|
8774
|
+
|
8775
|
+
// FFN shared expert
|
8776
|
+
{
|
8777
|
+
ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
|
8778
|
+
cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
|
8779
|
+
|
8780
|
+
// sigmoid
|
8781
|
+
ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
|
8782
|
+
cb(cur_gate, "ffn_shexp_gate", il);
|
8783
|
+
|
8784
|
+
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
|
8785
|
+
model.layers[il].ffn_up_shexp, NULL,
|
8786
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
8787
|
+
model.layers[il].ffn_down_shexp, NULL,
|
8788
|
+
NULL,
|
8789
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
8790
|
+
cb(cur_ffn, "ffn_shexp", il);
|
8791
|
+
|
8792
|
+
ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
|
8793
|
+
cb(ffn_shexp_out, "ffn_shexp_out", il);
|
8794
|
+
|
8795
|
+
moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
|
8796
|
+
cb(moe_out, "ffn_out", il);
|
8797
|
+
|
8798
|
+
cur = moe_out;
|
8799
|
+
}
|
8800
|
+
|
8801
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
8802
|
+
cb(cur, "l_out", il);
|
8803
|
+
|
8804
|
+
// input for next layer
|
8805
|
+
inpL = cur;
|
8806
|
+
}
|
8807
|
+
|
8808
|
+
cur = inpL;
|
8809
|
+
|
8810
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
8811
|
+
model.output_norm, NULL,
|
8812
|
+
LLM_NORM_RMS, cb, -1);
|
8813
|
+
cb(cur, "result_norm", -1);
|
8814
|
+
|
8815
|
+
// lm_head
|
8816
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
8817
|
+
cb(cur, "result_output", -1);
|
8818
|
+
|
8819
|
+
ggml_build_forward_expand(gf, cur);
|
8820
|
+
|
8821
|
+
return gf;
|
8822
|
+
}
|
8823
|
+
|
8216
8824
|
struct ggml_cgraph * build_phi2() {
|
8217
8825
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
8218
8826
|
|
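In the build_qwen2moe() hunk above, the shared-expert gate is computed as ggml_div(ggml_silu(x), x); since silu(x) = x * sigmoid(x), dividing by x recovers the plain sigmoid without a dedicated sigmoid op in the graph. Below is a minimal, self-contained check of that identity in plain C++ (not ggml; the helper names are illustrative only):

    #include <cmath>
    #include <cstdio>

    // silu(x) = x * sigmoid(x), so silu(x) / x == sigmoid(x) for x != 0
    static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
    static float silu   (float x) { return x * sigmoid(x); }

    int main() {
        for (float x : {-4.0f, -0.5f, 0.25f, 3.0f}) {
            std::printf("x=% .2f  silu(x)/x=%.6f  sigmoid(x)=%.6f\n", x, silu(x) / x, sigmoid(x));
        }
        return 0;
    }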
@@ -9452,6 +10060,31 @@ struct llm_build_context {
|
|
9452
10060
|
cb(Vcur, "Vcur", il);
|
9453
10061
|
}
|
9454
10062
|
|
10063
|
+
if (model.layers[il].attn_q_norm) {
|
10064
|
+
Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
|
10065
|
+
ggml_element_size(Qcur) * n_embd_head,
|
10066
|
+
ggml_element_size(Qcur) * n_embd_head * n_head,
|
10067
|
+
0);
|
10068
|
+
cb(Qcur, "Qcur", il);
|
10069
|
+
Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
|
10070
|
+
ggml_element_size(Kcur) * n_embd_head,
|
10071
|
+
ggml_element_size(Kcur) * n_embd_head * n_head_kv,
|
10072
|
+
0);
|
10073
|
+
cb(Kcur, "Kcur", il);
|
10074
|
+
|
10075
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
10076
|
+
model.layers[il].attn_q_norm,
|
10077
|
+
NULL,
|
10078
|
+
LLM_NORM, cb, il);
|
10079
|
+
cb(Qcur, "Qcur", il);
|
10080
|
+
|
10081
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
10082
|
+
model.layers[il].attn_k_norm,
|
10083
|
+
NULL,
|
10084
|
+
LLM_NORM, cb, il);
|
10085
|
+
cb(Kcur, "Kcur", il);
|
10086
|
+
}
|
10087
|
+
|
9455
10088
|
Qcur = ggml_rope_custom(
|
9456
10089
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
9457
10090
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
@@ -9522,6 +10155,139 @@ struct llm_build_context {
|
|
9522
10155
|
return gf;
|
9523
10156
|
|
9524
10157
|
}
|
10158
|
+
|
10159
|
+
// ref: https://allenai.org/olmo
|
10160
|
+
// based on the original build_llama() function, changes:
|
10161
|
+
// * non-parametric layer norm
|
10162
|
+
// * clamp qkv
|
10163
|
+
// * removed bias
|
10164
|
+
// * removed MoE
|
10165
|
+
struct ggml_cgraph * build_olmo() {
|
10166
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
10167
|
+
|
10168
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
10169
|
+
int32_t n_tokens = this->n_tokens;
|
10170
|
+
|
10171
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
10172
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
10173
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
10174
|
+
|
10175
|
+
struct ggml_tensor * cur;
|
10176
|
+
struct ggml_tensor * inpL;
|
10177
|
+
|
10178
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
10179
|
+
|
10180
|
+
// inp_pos - contains the positions
|
10181
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
10182
|
+
|
10183
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
10184
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
10185
|
+
|
10186
|
+
for (int il = 0; il < n_layer; ++il) {
|
10187
|
+
struct ggml_tensor * inpSA = inpL;
|
10188
|
+
|
10189
|
+
// norm
|
10190
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
10191
|
+
NULL, NULL,
|
10192
|
+
LLM_NORM, cb, il);
|
10193
|
+
cb(cur, "attn_norm", il);
|
10194
|
+
|
10195
|
+
// self-attention
|
10196
|
+
{
|
10197
|
+
// compute Q and K and RoPE them
|
10198
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
10199
|
+
cb(Qcur, "Qcur", il);
|
10200
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10201
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10202
|
+
cb(Qcur, "Qcur", il);
|
10203
|
+
}
|
10204
|
+
|
10205
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
10206
|
+
cb(Kcur, "Kcur", il);
|
10207
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10208
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10209
|
+
cb(Kcur, "Kcur", il);
|
10210
|
+
}
|
10211
|
+
|
10212
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
10213
|
+
cb(Vcur, "Vcur", il);
|
10214
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
10215
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
10216
|
+
cb(Vcur, "Vcur", il);
|
10217
|
+
}
|
10218
|
+
|
10219
|
+
Qcur = ggml_rope_custom(
|
10220
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
10221
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10222
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10223
|
+
);
|
10224
|
+
cb(Qcur, "Qcur", il);
|
10225
|
+
|
10226
|
+
Kcur = ggml_rope_custom(
|
10227
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
10228
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
10229
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
10230
|
+
);
|
10231
|
+
cb(Kcur, "Kcur", il);
|
10232
|
+
|
10233
|
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
10234
|
+
model.layers[il].wo, nullptr,
|
10235
|
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
10236
|
+
}
|
10237
|
+
|
10238
|
+
if (il == n_layer - 1) {
|
10239
|
+
// skip computing output for unused tokens
|
10240
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
10241
|
+
n_tokens = n_outputs;
|
10242
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
10243
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
10244
|
+
}
|
10245
|
+
|
10246
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
10247
|
+
cb(ffn_inp, "ffn_inp", il);
|
10248
|
+
|
10249
|
+
// feed-forward network
|
10250
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
10251
|
+
NULL, NULL,
|
10252
|
+
LLM_NORM, cb, il);
|
10253
|
+
cb(cur, "ffn_norm", il);
|
10254
|
+
|
10255
|
+
cur = llm_build_ffn(ctx0, cur,
|
10256
|
+
model.layers[il].ffn_up, NULL,
|
10257
|
+
model.layers[il].ffn_gate, NULL,
|
10258
|
+
model.layers[il].ffn_down, NULL,
|
10259
|
+
NULL,
|
10260
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
10261
|
+
cb(cur, "ffn_out", il);
|
10262
|
+
|
10263
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
10264
|
+
cb(cur, "ffn_out", il);
|
10265
|
+
|
10266
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
10267
|
+
if (layer_dir != nullptr) {
|
10268
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
10269
|
+
}
|
10270
|
+
cb(cur, "l_out", il);
|
10271
|
+
|
10272
|
+
// input for next layer
|
10273
|
+
inpL = cur;
|
10274
|
+
}
|
10275
|
+
|
10276
|
+
cur = inpL;
|
10277
|
+
|
10278
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
10279
|
+
NULL, NULL,
|
10280
|
+
LLM_NORM, cb, -1);
|
10281
|
+
cb(cur, "result_norm", -1);
|
10282
|
+
|
10283
|
+
// lm_head
|
10284
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
10285
|
+
cb(cur, "result_output", -1);
|
10286
|
+
|
10287
|
+
ggml_build_forward_expand(gf, cur);
|
10288
|
+
|
10289
|
+
return gf;
|
10290
|
+
}
|
9525
10291
|
};
|
9526
10292
|
|
9527
10293
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
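build_olmo() above passes NULL weights to llm_build_norm() with LLM_NORM, i.e. a plain layer norm with no learned scale or bias, and clamps Q/K/V activations to +/- hparams.f_clamp_kqv. A minimal sketch of the math only, on a flat float vector rather than ggml tensors (an assumption for illustration, not the graph ops themselves):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Non-parametric layer norm: zero mean, unit variance, no gamma/beta.
    static void layer_norm_inplace(std::vector<float> & x, float eps = 1e-5f) {
        float mean = 0.0f;
        for (float v : x) mean += v;
        mean /= x.size();
        float var = 0.0f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= x.size();
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (float & v : x) v = (v - mean) * inv_std;
    }

    // QKV clamping as in the builder: only applied when f_clamp_kqv > 0.
    static void clamp_kqv_inplace(std::vector<float> & x, float f_clamp_kqv) {
        if (f_clamp_kqv <= 0.0f) return;
        for (float & v : x) v = std::clamp(v, -f_clamp_kqv, f_clamp_kqv);
    }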
@@ -9671,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9671
10437
|
{
|
9672
10438
|
result = llm.build_qwen2();
|
9673
10439
|
} break;
|
10440
|
+
case LLM_ARCH_QWEN2MOE:
|
10441
|
+
{
|
10442
|
+
result = llm.build_qwen2moe();
|
10443
|
+
} break;
|
9674
10444
|
case LLM_ARCH_PHI2:
|
9675
10445
|
{
|
9676
10446
|
result = llm.build_phi2();
|
@@ -9715,9 +10485,17 @@ static struct ggml_cgraph * llama_build_graph(
|
|
9715
10485
|
{
|
9716
10486
|
result = llm.build_xverse();
|
9717
10487
|
} break;
|
9718
|
-
case LLM_ARCH_COMMAND_R:
|
10488
|
+
case LLM_ARCH_COMMAND_R:
|
10489
|
+
{
|
10490
|
+
result = llm.build_command_r();
|
10491
|
+
} break;
|
10492
|
+
case LLM_ARCH_DBRX:
|
10493
|
+
{
|
10494
|
+
result = llm.build_dbrx();
|
10495
|
+
} break;
|
10496
|
+
case LLM_ARCH_OLMO:
|
9719
10497
|
{
|
9720
|
-
result = llm.
|
10498
|
+
result = llm.build_olmo();
|
9721
10499
|
} break;
|
9722
10500
|
default:
|
9723
10501
|
GGML_ASSERT(false);
|
@@ -10409,6 +11187,9 @@ static int llama_decode_internal(
|
|
10409
11187
|
n_outputs_prev += lctx.n_outputs;
|
10410
11188
|
}
|
10411
11189
|
|
11190
|
+
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
11191
|
+
lctx.n_outputs = n_outputs;
|
11192
|
+
|
10412
11193
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
10413
11194
|
//llama_synchronize(&lctx);
|
10414
11195
|
|
@@ -11052,7 +11833,7 @@ struct llm_tokenizer_bpe {
|
|
11052
11833
|
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
|
11053
11834
|
}
|
11054
11835
|
|
11055
|
-
// add the
|
11836
|
+
// add the finished tokens to the final list keeping correct order for next and prev
|
11056
11837
|
for (auto & sym : symbols) {
|
11057
11838
|
if (sym.n > 0) {
|
11058
11839
|
sym.prev = final_prev_index;
|
@@ -11321,9 +12102,6 @@ struct llm_tokenizer_wpm {
|
|
11321
12102
|
output.push_back(vocab.special_unk_id);
|
11322
12103
|
}
|
11323
12104
|
}
|
11324
|
-
|
11325
|
-
// append eos token
|
11326
|
-
output.push_back(vocab.special_eos_id);
|
11327
12105
|
}
|
11328
12106
|
|
11329
12107
|
std::vector<std::string> preprocess(const std::string & text) {
|
@@ -11528,30 +12306,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
11528
12306
|
}
|
11529
12307
|
}
|
11530
12308
|
|
11531
|
-
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool
|
12309
|
+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
|
11532
12310
|
std::vector<llama_vocab::id> output;
|
11533
|
-
|
11534
|
-
// OG tokenizer behavior:
|
11535
|
-
//
|
11536
|
-
// tokenizer.encode('', add_bos=True) returns [1]
|
11537
|
-
// tokenizer.encode('', add_bos=False) returns []
|
11538
|
-
|
11539
|
-
if (bos && vocab.special_bos_id != -1) {
|
11540
|
-
output.push_back(vocab.special_bos_id);
|
11541
|
-
}
|
11542
|
-
|
11543
|
-
if (raw_text.empty()) {
|
11544
|
-
return output;
|
11545
|
-
}
|
11546
|
-
|
11547
12311
|
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
11548
|
-
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
11549
12312
|
|
11550
|
-
if (
|
12313
|
+
if (!raw_text.empty()) {
|
12314
|
+
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
|
12315
|
+
if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
|
12316
|
+
}
|
11551
12317
|
|
11552
12318
|
switch (vocab.type) {
|
11553
12319
|
case LLAMA_VOCAB_TYPE_SPM:
|
11554
12320
|
{
|
12321
|
+
// OG tokenizer behavior:
|
12322
|
+
//
|
12323
|
+
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
12324
|
+
// tokenizer.encode('', add_special_tokens=False) returns []
|
12325
|
+
|
12326
|
+
if (add_special && vocab.special_add_bos != 0) {
|
12327
|
+
GGML_ASSERT(vocab.special_bos_id != -1);
|
12328
|
+
output.push_back(vocab.special_bos_id);
|
12329
|
+
}
|
12330
|
+
|
11555
12331
|
for (const auto & fragment : fragment_buffer) {
|
11556
12332
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11557
12333
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
@@ -11577,9 +12353,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11577
12353
|
output.push_back(fragment.token);
|
11578
12354
|
}
|
11579
12355
|
}
|
12356
|
+
|
12357
|
+
if (add_special && vocab.special_add_eos == 1) {
|
12358
|
+
GGML_ASSERT(vocab.special_eos_id != -1);
|
12359
|
+
output.push_back(vocab.special_eos_id);
|
12360
|
+
}
|
11580
12361
|
} break;
|
11581
12362
|
case LLAMA_VOCAB_TYPE_BPE:
|
11582
12363
|
{
|
12364
|
+
if (add_special && vocab.special_add_bos == 1) {
|
12365
|
+
GGML_ASSERT(vocab.special_bos_id != -1);
|
12366
|
+
output.push_back(vocab.special_bos_id);
|
12367
|
+
}
|
12368
|
+
|
11583
12369
|
for (const auto & fragment : fragment_buffer) {
|
11584
12370
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11585
12371
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
@@ -11593,9 +12379,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11593
12379
|
output.push_back(fragment.token);
|
11594
12380
|
}
|
11595
12381
|
}
|
12382
|
+
|
12383
|
+
GGML_ASSERT(vocab.special_add_eos != 1);
|
11596
12384
|
} break;
|
11597
12385
|
case LLAMA_VOCAB_TYPE_WPM:
|
11598
12386
|
{
|
12387
|
+
if (add_special) {
|
12388
|
+
GGML_ASSERT(vocab.special_cls_id != -1);
|
12389
|
+
output.push_back(vocab.special_cls_id);
|
12390
|
+
}
|
12391
|
+
|
11599
12392
|
for (const auto & fragment : fragment_buffer) {
|
11600
12393
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
11601
12394
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
@@ -11609,6 +12402,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
11609
12402
|
output.push_back(fragment.token);
|
11610
12403
|
}
|
11611
12404
|
}
|
12405
|
+
|
12406
|
+
if (add_special) {
|
12407
|
+
GGML_ASSERT(vocab.special_sep_id != -1);
|
12408
|
+
output.push_back(vocab.special_sep_id);
|
12409
|
+
}
|
11612
12410
|
} break;
|
11613
12411
|
case LLAMA_VOCAB_TYPE_NONE:
|
11614
12412
|
GGML_ASSERT(false);
|
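The tokenizer hunks above split the old BOS flag into two: add_special controls whether the vocab-appropriate sentinel tokens are added (BOS for SPM/BPE, CLS/SEP for WPM, plus EOS for SPM when the model requests it), while parse_special controls whether special tokens embedded in the text are recognized. A hedged usage sketch, assuming the public llama_tokenize() wrapper in llama.h forwards these two flags unchanged:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Sketch: tokenize user text without parsing embedded special tokens (safer
    // for untrusted input), while still letting the tokenizer add BOS/CLS/SEP.
    static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text) {
        // start with a buffer that is large enough; a negative return value would
        // report the required size with the opposite sign
        std::vector<llama_token> tokens(text.size() + 8);
        const int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special   =*/ true,
                                         /*parse_special =*/ false);
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }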
@@ -11775,7 +12573,9 @@ static void llama_grammar_advance_stack(
|
|
11775
12573
|
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
11776
12574
|
|
11777
12575
|
if (stack.empty()) {
|
11778
|
-
new_stacks.
|
12576
|
+
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
12577
|
+
new_stacks.emplace_back(stack);
|
12578
|
+
}
|
11779
12579
|
return;
|
11780
12580
|
}
|
11781
12581
|
|
@@ -11812,7 +12612,10 @@ static void llama_grammar_advance_stack(
|
|
11812
12612
|
}
|
11813
12613
|
case LLAMA_GRETYPE_CHAR:
|
11814
12614
|
case LLAMA_GRETYPE_CHAR_NOT:
|
11815
|
-
new_stacks.
|
12615
|
+
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
12616
|
+
// only add the stack if it's not a duplicate of one we already have
|
12617
|
+
new_stacks.emplace_back(stack);
|
12618
|
+
}
|
11816
12619
|
break;
|
11817
12620
|
default:
|
11818
12621
|
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
@@ -11826,12 +12629,13 @@ static void llama_grammar_advance_stack(
|
|
11826
12629
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
11827
12630
|
// produces the N possible stacks if the given char is accepted at those
|
11828
12631
|
// positions
|
11829
|
-
|
12632
|
+
void llama_grammar_accept(
|
11830
12633
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
11831
12634
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
11832
|
-
const uint32_t chr
|
12635
|
+
const uint32_t chr,
|
12636
|
+
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
11833
12637
|
|
11834
|
-
|
12638
|
+
new_stacks.clear();
|
11835
12639
|
|
11836
12640
|
for (const auto & stack : stacks) {
|
11837
12641
|
if (stack.empty()) {
|
@@ -11850,8 +12654,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
|
11850
12654
|
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
11851
12655
|
}
|
11852
12656
|
}
|
11853
|
-
|
11854
|
-
return new_stacks;
|
11855
12657
|
}
|
11856
12658
|
|
11857
12659
|
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
@@ -11865,6 +12667,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
11865
12667
|
const std::vector<llama_grammar_candidate> & candidates) {
|
11866
12668
|
|
11867
12669
|
std::vector<llama_grammar_candidate> rejects;
|
12670
|
+
rejects.reserve(candidates.size());
|
11868
12671
|
|
11869
12672
|
if (stack.empty()) {
|
11870
12673
|
for (const auto & tok : candidates) {
|
@@ -11878,6 +12681,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
|
11878
12681
|
const llama_grammar_element * stack_pos = stack.back();
|
11879
12682
|
|
11880
12683
|
std::vector<llama_grammar_candidate> next_candidates;
|
12684
|
+
next_candidates.reserve(candidates.size());
|
12685
|
+
|
11881
12686
|
for (const auto & tok : candidates) {
|
11882
12687
|
if (*tok.code_points == 0) {
|
11883
12688
|
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
@@ -12685,8 +13490,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
12685
13490
|
// Note terminating 0 in decoded string
|
12686
13491
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
12687
13492
|
const auto & code_points = decoded.first;
|
13493
|
+
std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
|
12688
13494
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
12689
|
-
|
13495
|
+
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
|
13496
|
+
grammar->stacks = tmp_new_stacks;
|
12690
13497
|
}
|
12691
13498
|
grammar->partial_utf8 = decoded.second;
|
12692
13499
|
GGML_ASSERT(!grammar->stacks.empty());
|
@@ -12820,6 +13627,11 @@ struct llama_beam_search_data {
|
|
12820
13627
|
}
|
12821
13628
|
llama_logit_info logit_info(ctx);
|
12822
13629
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
13630
|
+
|
13631
|
+
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
13632
|
+
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
13633
|
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
13634
|
+
|
12823
13635
|
size_t i=0;
|
12824
13636
|
if (next_beams.size() < n_beams) {
|
12825
13637
|
for (; next_beams.size() < n_beams ; ++i) {
|
@@ -13318,9 +14130,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|
13318
14130
|
return new_type;
|
13319
14131
|
}
|
13320
14132
|
|
13321
|
-
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const
|
14133
|
+
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
13322
14134
|
std::mutex mutex;
|
13323
|
-
|
14135
|
+
int64_t counter = 0;
|
13324
14136
|
size_t new_size = 0;
|
13325
14137
|
if (nthread < 2) {
|
13326
14138
|
// single-thread
|
@@ -13328,11 +14140,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
13328
14140
|
}
|
13329
14141
|
auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
|
13330
14142
|
nrows, n_per_row, imatrix]() {
|
13331
|
-
const
|
14143
|
+
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
13332
14144
|
size_t local_size = 0;
|
13333
14145
|
while (true) {
|
13334
14146
|
std::unique_lock<std::mutex> lock(mutex);
|
13335
|
-
|
14147
|
+
int64_t first_row = counter; counter += nrows_per_chunk;
|
13336
14148
|
if (first_row >= nrows) {
|
13337
14149
|
if (local_size > 0) {
|
13338
14150
|
new_size += local_size;
|
@@ -13340,7 +14152,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|
13340
14152
|
break;
|
13341
14153
|
}
|
13342
14154
|
lock.unlock();
|
13343
|
-
const
|
14155
|
+
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
13344
14156
|
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
13345
14157
|
}
|
13346
14158
|
};
|
@@ -13440,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13440
14252
|
gguf_set_kv (ctx_out, ml.meta);
|
13441
14253
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
13442
14254
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
14255
|
+
// Remove split metadata
|
14256
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
14257
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
|
14258
|
+
gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
|
13443
14259
|
|
13444
14260
|
if (params->kv_overrides) {
|
13445
14261
|
const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
|
@@ -13463,7 +14279,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13463
14279
|
const std::string name = ggml_get_name(meta);
|
13464
14280
|
|
13465
14281
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
13466
|
-
if (name.find("attn_v.weight")
|
14282
|
+
if (name.find("attn_v.weight") != std::string::npos ||
|
14283
|
+
name.find("attn_qkv.weight") != std::string::npos) {
|
13467
14284
|
++qs.n_attention_wv;
|
13468
14285
|
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
13469
14286
|
qs.has_output = true;
|
@@ -13473,7 +14290,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13473
14290
|
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
13474
14291
|
|
13475
14292
|
// sanity checks
|
13476
|
-
|
14293
|
+
//
|
14294
|
+
// - qs.n_attention_wv == 0 for Mamba models
|
14295
|
+
// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
|
14296
|
+
//
|
14297
|
+
GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
|
13477
14298
|
|
13478
14299
|
size_t total_size_org = 0;
|
13479
14300
|
size_t total_size_new = 0;
|
@@ -13529,6 +14350,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13529
14350
|
|
13530
14351
|
// quantize only 2D and 3D tensors (experts)
|
13531
14352
|
quantize &= (ggml_n_dims(tensor) >= 2);
|
14353
|
+
|
14354
|
+
// do not quantize norm tensors
|
14355
|
+
quantize &= name.find("_norm.weight") == std::string::npos;
|
14356
|
+
|
13532
14357
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
13533
14358
|
quantize &= !params->only_copy;
|
13534
14359
|
|
@@ -13557,10 +14382,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13557
14382
|
if (!params->pure && ggml_is_quantized(default_type)) {
|
13558
14383
|
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
|
13559
14384
|
}
|
13560
|
-
|
14385
|
+
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
|
13561
14386
|
new_type = params->token_embedding_type;
|
13562
14387
|
}
|
13563
|
-
|
14388
|
+
if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
|
13564
14389
|
new_type = params->output_tensor_type;
|
13565
14390
|
}
|
13566
14391
|
|
@@ -13575,7 +14400,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13575
14400
|
new_size = ggml_nbytes(tensor);
|
13576
14401
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
13577
14402
|
} else {
|
13578
|
-
const
|
14403
|
+
const int64_t nelements = ggml_nelements(tensor);
|
13579
14404
|
|
13580
14405
|
const float * imatrix = nullptr;
|
13581
14406
|
if (imatrix_data) {
|
@@ -13627,20 +14452,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
13627
14452
|
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
13628
14453
|
fflush(stdout);
|
13629
14454
|
|
13630
|
-
if (work.size() < nelements * 4) {
|
14455
|
+
if (work.size() < (size_t)nelements * 4) {
|
13631
14456
|
work.resize(nelements * 4); // upper bound on size
|
13632
14457
|
}
|
13633
14458
|
new_data = work.data();
|
13634
14459
|
|
13635
|
-
const
|
13636
|
-
const
|
14460
|
+
const int64_t n_per_row = tensor->ne[0];
|
14461
|
+
const int64_t nrows = tensor->ne[1];
|
13637
14462
|
|
13638
|
-
static const
|
13639
|
-
const
|
14463
|
+
static const int64_t min_chunk_size = 32 * 512;
|
14464
|
+
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
13640
14465
|
|
13641
|
-
const
|
13642
|
-
const
|
13643
|
-
const
|
14466
|
+
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
14467
|
+
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
14468
|
+
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
13644
14469
|
|
13645
14470
|
// quantize each expert separately since they have different importance matrices
|
13646
14471
|
new_size = 0;
|
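The widened int64_t bookkeeping above follows a simple chunking rule: each work chunk covers at least min_chunk_size = 32 * 512 weights, rounded up to whole rows, and the effective thread count is capped by the number of chunks. A small runnable example of that arithmetic for a hypothetical 4096 x 32000 tensor (variable names mirror the hunk, the shapes are made up):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_per_row = 4096;   // tensor->ne[0] (hypothetical)
        const int64_t nrows     = 32000;  // tensor->ne[1] (hypothetical)
        const int     nthread   = 8;

        static const int64_t min_chunk_size = 32 * 512;
        const int64_t chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

        const int64_t nelements_matrix = n_per_row * nrows;
        const int64_t nchunk          = (nelements_matrix + chunk_size - 1) / chunk_size;
        const int64_t nthread_use     = nthread > 1 ? std::max((int64_t) 1, std::min((int64_t) nthread, nchunk)) : 1;
        const int64_t nrows_per_chunk = chunk_size / n_per_row;

        std::printf("chunk_size=%lld nchunk=%lld nthread_use=%lld nrows_per_chunk=%lld\n",
                    (long long) chunk_size, (long long) nchunk,
                    (long long) nthread_use, (long long) nrows_per_chunk);
        return 0;
    }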
@@ -14525,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
14525
15350
|
case LLM_ARCH_MINICPM:
|
14526
15351
|
case LLM_ARCH_XVERSE:
|
14527
15352
|
case LLM_ARCH_COMMAND_R:
|
15353
|
+
case LLM_ARCH_OLMO:
|
14528
15354
|
return LLAMA_ROPE_TYPE_NORM;
|
14529
15355
|
|
14530
15356
|
// the pairs of head values are offset by n_rot/2
|
14531
15357
|
case LLM_ARCH_FALCON:
|
14532
15358
|
case LLM_ARCH_GROK:
|
15359
|
+
case LLM_ARCH_DBRX:
|
14533
15360
|
case LLM_ARCH_PERSIMMON:
|
14534
15361
|
case LLM_ARCH_BERT:
|
14535
15362
|
case LLM_ARCH_NOMIC_BERT:
|
14536
15363
|
case LLM_ARCH_STABLELM:
|
14537
15364
|
case LLM_ARCH_QWEN:
|
14538
15365
|
case LLM_ARCH_QWEN2:
|
15366
|
+
case LLM_ARCH_QWEN2MOE:
|
14539
15367
|
case LLM_ARCH_PHI2:
|
14540
15368
|
case LLM_ARCH_GEMMA:
|
14541
15369
|
case LLM_ARCH_STARCODER2:
|
@@ -14905,9 +15733,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
|
14905
15733
|
llama_kv_cache_update_internal(*ctx);
|
14906
15734
|
}
|
14907
15735
|
|
15736
|
+
// deprecated
|
15737
|
+
size_t llama_get_state_size(const struct llama_context * ctx) {
|
15738
|
+
return llama_state_get_size(ctx);
|
15739
|
+
}
|
15740
|
+
|
15741
|
+
// deprecated
|
15742
|
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
15743
|
+
return llama_state_get_data(ctx, dst);
|
15744
|
+
}
|
15745
|
+
|
15746
|
+
// deprecated
|
15747
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
15748
|
+
return llama_state_set_data(ctx, src);
|
15749
|
+
}
|
15750
|
+
|
15751
|
+
// deprecated
|
15752
|
+
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15753
|
+
return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
15754
|
+
}
|
15755
|
+
|
15756
|
+
// deprecated
|
15757
|
+
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
15758
|
+
return llama_state_save_file(ctx, path_session, tokens, n_token_count);
|
15759
|
+
}
|
14908
15760
|
|
14909
15761
|
// Returns the *maximum* size of the state
|
14910
|
-
size_t
|
15762
|
+
size_t llama_state_get_size(const struct llama_context * ctx) {
|
14911
15763
|
const auto & cparams = ctx->cparams;
|
14912
15764
|
const auto & hparams = ctx->model.hparams;
|
14913
15765
|
|
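The block above keeps the old llama_get_state_size / llama_copy_state_data / llama_set_state_data / llama_load_session_file / llama_save_session_file entry points as thin deprecated wrappers around the new llama_state_* names. A hedged sketch of a whole-context snapshot and restore with the new names (context setup and error handling omitted):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Snapshot the whole context state into a buffer and restore it later.
    static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_state_get_size(ctx)); // maximum size of the state
        const size_t written = llama_state_get_data(ctx, buf.data());
        buf.resize(written);
        return buf;
    }

    static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_state_set_data(ctx, buf.data()); // returns the number of bytes read
    }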
@@ -14995,15 +15847,15 @@ struct llama_data_file_context : llama_data_context {
|
|
14995
15847
|
* file context:
|
14996
15848
|
* llama_file file("/path", "wb");
|
14997
15849
|
* llama_data_file_context data_ctx(&file);
|
14998
|
-
*
|
15850
|
+
* llama_state_get_data(ctx, &data_ctx);
|
14999
15851
|
*
|
15000
15852
|
* buffer context:
|
15001
15853
|
* std::vector<uint8_t> buf(max_size, 0);
|
15002
15854
|
* llama_data_buffer_context data_ctx(&buf.data());
|
15003
|
-
*
|
15855
|
+
* llama_state_get_data(ctx, &data_ctx);
|
15004
15856
|
*
|
15005
15857
|
*/
|
15006
|
-
static void
|
15858
|
+
static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
15007
15859
|
// copy rng
|
15008
15860
|
{
|
15009
15861
|
std::ostringstream rng_ss;
|
@@ -15147,15 +15999,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
15147
15999
|
}
|
15148
16000
|
}
|
15149
16001
|
|
15150
|
-
size_t
|
16002
|
+
size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
|
15151
16003
|
llama_data_buffer_context data_ctx(dst);
|
15152
|
-
|
16004
|
+
llama_state_get_data_internal(ctx, &data_ctx);
|
15153
16005
|
|
15154
16006
|
return data_ctx.get_size_written();
|
15155
16007
|
}
|
15156
16008
|
|
15157
16009
|
// Sets the state reading from the specified source address
|
15158
|
-
size_t
|
16010
|
+
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
15159
16011
|
const uint8_t * inp = src;
|
15160
16012
|
|
15161
16013
|
// set rng
|
@@ -15192,6 +16044,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15192
16044
|
GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
|
15193
16045
|
ctx->output_ids[id] = i;
|
15194
16046
|
}
|
16047
|
+
|
16048
|
+
ctx->n_outputs = n_outputs;
|
15195
16049
|
}
|
15196
16050
|
}
|
15197
16051
|
|
@@ -15307,14 +16161,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|
15307
16161
|
}
|
15308
16162
|
|
15309
16163
|
const size_t nread = inp - src;
|
15310
|
-
const size_t max_size =
|
16164
|
+
const size_t max_size = llama_state_get_size(ctx);
|
15311
16165
|
|
15312
16166
|
GGML_ASSERT(nread <= max_size);
|
15313
16167
|
|
15314
16168
|
return nread;
|
15315
16169
|
}
|
15316
16170
|
|
15317
|
-
static bool
|
16171
|
+
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15318
16172
|
llama_file file(path_session, "rb");
|
15319
16173
|
|
15320
16174
|
// sanity checks
|
@@ -15352,7 +16206,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
15352
16206
|
// restore the context state
|
15353
16207
|
{
|
15354
16208
|
const size_t n_state_size_cur = file.size - file.tell();
|
15355
|
-
const size_t n_state_size_max =
|
16209
|
+
const size_t n_state_size_max = llama_state_get_size(ctx);
|
15356
16210
|
|
15357
16211
|
if (n_state_size_cur > n_state_size_max) {
|
15358
16212
|
LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
@@ -15362,22 +16216,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
|
|
15362
16216
|
std::vector<uint8_t> state_data(n_state_size_max);
|
15363
16217
|
file.read_raw(state_data.data(), n_state_size_cur);
|
15364
16218
|
|
15365
|
-
|
16219
|
+
llama_state_set_data(ctx, state_data.data());
|
15366
16220
|
}
|
15367
16221
|
|
15368
16222
|
return true;
|
15369
16223
|
}
|
15370
16224
|
|
15371
|
-
bool
|
16225
|
+
bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
15372
16226
|
try {
|
15373
|
-
return
|
16227
|
+
return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
15374
16228
|
} catch (const std::exception & err) {
|
15375
16229
|
LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
|
15376
16230
|
return false;
|
15377
16231
|
}
|
15378
16232
|
}
|
15379
16233
|
|
15380
|
-
bool
|
16234
|
+
static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
15381
16235
|
llama_file file(path_session, "wb");
|
15382
16236
|
|
15383
16237
|
file.write_u32(LLAMA_SESSION_MAGIC);
|
@@ -15391,11 +16245,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
15391
16245
|
|
15392
16246
|
// save the context state using stream saving
|
15393
16247
|
llama_data_file_context data_ctx(&file);
|
15394
|
-
|
16248
|
+
llama_state_get_data_internal(ctx, &data_ctx);
|
15395
16249
|
|
15396
16250
|
return true;
|
15397
16251
|
}
|
15398
16252
|
|
16253
|
+
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
16254
|
+
try {
|
16255
|
+
return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
|
16256
|
+
} catch (const std::exception & err) {
|
16257
|
+
LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
|
16258
|
+
return false;
|
16259
|
+
}
|
16260
|
+
}
|
16261
|
+
|
16262
|
+
size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
|
16263
|
+
// save the size of size_t as a uint32_t for safety check
|
16264
|
+
const size_t size_t_size_size = sizeof(uint32_t);
|
16265
|
+
|
16266
|
+
// other values
|
16267
|
+
const size_t s_cell_count_size = sizeof(uint32_t);
|
16268
|
+
const size_t s_layer_count_size = sizeof(uint32_t);
|
16269
|
+
const size_t n_embd_v_gqa_size = sizeof(uint32_t);
|
16270
|
+
|
16271
|
+
size_t s_cell_count = 0;
|
16272
|
+
size_t s_cell_data_size = 0;
|
16273
|
+
const auto & kv_self = ctx->kv_self;
|
16274
|
+
const auto & hparams = ctx->model.hparams;
|
16275
|
+
|
16276
|
+
const uint32_t n_layer = hparams.n_layer;
|
16277
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16278
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16279
|
+
|
16280
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
16281
|
+
const auto & cell = kv_self.cells[i];
|
16282
|
+
if (cell.seq_id.count(seq_id) > 0) {
|
16283
|
+
++s_cell_count;
|
16284
|
+
s_cell_data_size += sizeof(llama_pos);
|
16285
|
+
}
|
16286
|
+
}
|
16287
|
+
|
16288
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16289
|
+
// types of keys and values
|
16290
|
+
s_cell_data_size += sizeof(int32_t) * 2;
|
16291
|
+
// k_size_row and v_size_el values of layer
|
16292
|
+
s_cell_data_size += sizeof(size_t) * 2;
|
16293
|
+
|
16294
|
+
// keys
|
16295
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16296
|
+
s_cell_data_size += k_size_row * s_cell_count;
|
16297
|
+
|
16298
|
+
// values (transposed)
|
16299
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16300
|
+
s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
|
16301
|
+
}
|
16302
|
+
|
16303
|
+
const size_t s_total = (
|
16304
|
+
size_t_size_size +
|
16305
|
+
s_cell_count_size +
|
16306
|
+
s_layer_count_size +
|
16307
|
+
n_embd_v_gqa_size +
|
16308
|
+
s_cell_data_size
|
16309
|
+
);
|
16310
|
+
|
16311
|
+
return s_total;
|
16312
|
+
}
|
16313
|
+
|
16314
|
+
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
16315
|
+
const auto & kv_self = ctx->kv_self;
|
16316
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16317
|
+
|
16318
|
+
// Save the size of size_t as a uint32_t for safety check
|
16319
|
+
const uint32_t size_t_size = sizeof(size_t);
|
16320
|
+
data_ctx.write(&size_t_size, sizeof(size_t_size));
|
16321
|
+
|
16322
|
+
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
16323
|
+
uint32_t cell_count = 0;
|
16324
|
+
|
16325
|
+
// Count the number of cells with the specified seq_id
|
16326
|
+
// Find all the ranges of cells with this seq id
|
16327
|
+
{
|
16328
|
+
uint32_t cell_range_begin = kv_self.size;
|
16329
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
16330
|
+
const auto & cell = kv_self.cells[i];
|
16331
|
+
if (cell.has_seq_id(seq_id)) {
|
16332
|
+
++cell_count;
|
16333
|
+
if (cell_range_begin == kv_self.size) {
|
16334
|
+
cell_range_begin = i;
|
16335
|
+
}
|
16336
|
+
}
|
16337
|
+
else {
|
16338
|
+
if (cell_range_begin != kv_self.size) {
|
16339
|
+
cell_ranges.push_back({ cell_range_begin, i });
|
16340
|
+
cell_range_begin = kv_self.size;
|
16341
|
+
}
|
16342
|
+
}
|
16343
|
+
}
|
16344
|
+
if (cell_range_begin != kv_self.size) {
|
16345
|
+
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
16346
|
+
}
|
16347
|
+
|
16348
|
+
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
16349
|
+
uint32_t cell_count_check = 0;
|
16350
|
+
for (const auto & range : cell_ranges) {
|
16351
|
+
cell_count_check += range.second - range.first;
|
16352
|
+
}
|
16353
|
+
GGML_ASSERT(cell_count == cell_count_check);
|
16354
|
+
}
|
16355
|
+
|
16356
|
+
// Write the cell count
|
16357
|
+
data_ctx.write(&cell_count, sizeof(cell_count));
|
16358
|
+
|
16359
|
+
const auto & hparams = ctx->model.hparams;
|
16360
|
+
const uint32_t n_layer = hparams.n_layer;
|
16361
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16362
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16363
|
+
|
16364
|
+
// Write the layer count
|
16365
|
+
data_ctx.write(&n_layer, sizeof(n_layer));
|
16366
|
+
|
16367
|
+
// Write n_embd_v_gqa
|
16368
|
+
data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
|
16369
|
+
|
16370
|
+
// Iterate the ranges and write all the pos (this is the token position in the prompt)
|
16371
|
+
for (const auto & range : cell_ranges) {
|
16372
|
+
for (uint32_t i = range.first; i < range.second; ++i) {
|
16373
|
+
const auto & cell = kv_self.cells[i];
|
16374
|
+
data_ctx.write(&cell.pos, sizeof(cell.pos));
|
16375
|
+
}
|
16376
|
+
}
|
16377
|
+
|
16378
|
+
// Iterate and write all the keys first, each row is a cell
|
16379
|
+
// Get whole range at a time
|
16380
|
+
std::vector<uint8_t> tmp_buf;
|
16381
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16382
|
+
// Write key type
|
16383
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
16384
|
+
data_ctx.write(&k_type_i, sizeof(k_type_i));
|
16385
|
+
|
16386
|
+
// Write row size of key
|
16387
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16388
|
+
data_ctx.write(&k_size_row, sizeof(k_size_row));
|
16389
|
+
|
16390
|
+
// Read each range of cells of k_size length each into tmp_buf and write out
|
16391
|
+
for (const auto & range : cell_ranges) {
|
16392
|
+
const size_t range_size = range.second - range.first;
|
16393
|
+
tmp_buf.resize(range_size * k_size_row);
|
16394
|
+
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
16395
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16396
|
+
}
|
16397
|
+
}
|
16398
|
+
|
16399
|
+
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
16400
|
+
const uint32_t kv_size = kv_self.size;
|
16401
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16402
|
+
// Write value type
|
16403
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16404
|
+
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
16405
|
+
|
16406
|
+
// Write element size
|
16407
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16408
|
+
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
16409
|
+
|
16410
|
+
// For each row, we get the element values of each cell
|
16411
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16412
|
+
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
16413
|
+
for (const auto & range : cell_ranges) {
|
16414
|
+
const size_t range_size = range.second - range.first;
|
16415
|
+
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
16416
|
+
tmp_buf.resize(range_size * v_size_el);
|
16417
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
16418
|
+
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
16419
|
+
}
|
16420
|
+
}
|
16421
|
+
}
|
16422
|
+
|
16423
|
+
return data_ctx.get_size_written();
|
16424
|
+
}
|
16425
|
+
|
16426
|
+
size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
|
16427
|
+
llama_data_buffer_context data_ctx(dst);
|
16428
|
+
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
16429
|
+
}
|
16430
|
+
|
16431
|
+
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
16432
|
+
auto & kv_self = ctx->kv_self;
|
16433
|
+
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
16434
|
+
|
16435
|
+
// Wipe the slot
|
16436
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16437
|
+
|
16438
|
+
const uint8_t * inp = src;
|
16439
|
+
|
16440
|
+
// Read size of size_t
|
16441
|
+
uint32_t size_t_size;
|
16442
|
+
memcpy(&size_t_size, inp, sizeof(size_t_size));
|
16443
|
+
inp += sizeof(size_t_size);
|
16444
|
+
if (size_t_size != sizeof(size_t)) {
|
16445
|
+
LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
|
16446
|
+
return 0;
|
16447
|
+
}
|
16448
|
+
|
16449
|
+
// Read the cell count
|
16450
|
+
uint32_t cell_count;
|
16451
|
+
memcpy(&cell_count, inp, sizeof(cell_count));
|
16452
|
+
inp += sizeof(cell_count);
|
16453
|
+
|
16454
|
+
// Read the layer count
|
16455
|
+
uint32_t n_layer_ref;
|
16456
|
+
memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
|
16457
|
+
inp += sizeof(n_layer_ref);
|
16458
|
+
|
16459
|
+
// Read n_embd_v_gqa
|
16460
|
+
uint32_t n_embd_v_gqa_ref;
|
16461
|
+
memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
|
16462
|
+
inp += sizeof(n_embd_v_gqa_ref);
|
16463
|
+
|
16464
|
+
// Sanity check model compatibility
|
16465
|
+
const auto & hparams = ctx->model.hparams;
|
16466
|
+
const uint32_t n_layer = hparams.n_layer;
|
16467
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
16468
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
16469
|
+
if (n_layer != n_layer_ref) {
|
16470
|
+
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
|
16471
|
+
return 0;
|
16472
|
+
}
|
16473
|
+
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
|
16474
|
+
LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
|
16475
|
+
return 0;
|
16476
|
+
}
|
16477
|
+
|
16478
|
+
// Allocate the new cells for the slot
|
16479
|
+
if (cell_count) {
|
16480
|
+
llama_batch batch = llama_batch_init(cell_count, 0, 1);
|
16481
|
+
batch.n_tokens = cell_count;
|
16482
|
+
for (uint32_t i = 0; i < cell_count; ++i) {
|
16483
|
+
llama_pos pos;
|
16484
|
+
memcpy(&pos, inp, sizeof(pos));
|
16485
|
+
inp += sizeof(pos);
|
16486
|
+
|
16487
|
+
batch.pos[i] = pos;
|
16488
|
+
batch.n_seq_id[i] = 1;
|
16489
|
+
batch.seq_id[i][0] = dest_seq_id;
|
16490
|
+
}
|
16491
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
16492
|
+
llama_batch_free(batch);
|
16493
|
+
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
16494
|
+
return 0;
|
16495
|
+
}
|
16496
|
+
|
16497
|
+
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
16498
|
+
// Assume that this is one contiguous block of cells
|
16499
|
+
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
|
16500
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
|
16501
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
16502
|
+
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
|
16503
|
+
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
|
16504
|
+
|
16505
|
+
// Cleanup
|
16506
|
+
llama_batch_free(batch);
|
16507
|
+
}
|
16508
|
+
|
16509
|
+
const uint32_t kv_size = kv_self.size;
|
16510
|
+
const uint32_t kv_head = kv_self.head;
|
16511
|
+
|
16512
|
+
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
16513
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16514
|
+
// Read type of key
|
16515
|
+
int32_t k_type_i_ref;
|
16516
|
+
memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
|
16517
|
+
inp += sizeof(k_type_i_ref);
|
16518
|
+
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
16519
|
+
if (k_type_i != k_type_i_ref) {
|
16520
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16521
|
+
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
16522
|
+
return 0;
|
16523
|
+
}
|
16524
|
+
|
16525
|
+
// Read row size of key
|
16526
|
+
size_t k_size_row_ref;
|
16527
|
+
memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
|
16528
|
+
inp += sizeof(k_size_row_ref);
|
16529
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
16530
|
+
if (k_size_row != k_size_row_ref) {
|
16531
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16532
|
+
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
|
16533
|
+
return 0;
|
16534
|
+
}
|
16535
|
+
|
16536
|
+
if (cell_count) {
|
16537
|
+
// Read and set the keys for the whole cell range
|
16538
|
+
ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
|
16539
|
+
inp += cell_count * k_size_row;
|
16540
|
+
}
|
16541
|
+
}
|
16542
|
+
|
16543
|
+
// For each layer, read the values for each cell (transposed)
|
16544
|
+
for (int il = 0; il < (int)n_layer; ++il) {
|
16545
|
+
// Read type of value
|
16546
|
+
int32_t v_type_i_ref;
|
16547
|
+
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
16548
|
+
inp += sizeof(v_type_i_ref);
|
16549
|
+
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
16550
|
+
if (v_type_i != v_type_i_ref) {
|
16551
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16552
|
+
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
16553
|
+
return 0;
|
16554
|
+
}
|
16555
|
+
|
16556
|
+
// Read element size of value
|
16557
|
+
size_t v_size_el_ref;
|
16558
|
+
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
16559
|
+
inp += sizeof(v_size_el_ref);
|
16560
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
16561
|
+
if (v_size_el != v_size_el_ref) {
|
16562
|
+
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
16563
|
+
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
16564
|
+
return 0;
|
16565
|
+
}
|
16566
|
+
|
16567
|
+
if (cell_count) {
|
16568
|
+
// For each row in the transposed matrix, read the values for the whole cell range
|
16569
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
16570
|
+
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
16571
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
16572
|
+
inp += cell_count * v_size_el;
|
16573
|
+
}
|
16574
|
+
}
|
16575
|
+
}
|
16576
|
+
|
16577
|
+
const size_t nread = inp - src;
|
16578
|
+
return nread;
|
16579
|
+
}
|
16580
|
+
|
16581
|
+
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
16582
|
+
llama_file file(filepath, "wb");
|
16583
|
+
|
16584
|
+
file.write_u32(LLAMA_STATE_SEQ_MAGIC);
|
16585
|
+
file.write_u32(LLAMA_STATE_SEQ_VERSION);
|
16586
|
+
|
16587
|
+
// save the prompt
|
16588
|
+
file.write_u32((uint32_t)n_token_count);
|
16589
|
+
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
16590
|
+
|
16591
|
+
// save the context state using stream saving
|
16592
|
+
llama_data_file_context data_ctx(&file);
|
16593
|
+
llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
16594
|
+
|
16595
|
+
const size_t res = file.tell();
|
16596
|
+
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
|
16597
|
+
return res;
|
16598
|
+
}
|
16599
|
+
|
16600
|
+
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
16601
|
+
llama_file file(filepath, "rb");
|
16602
|
+
|
16603
|
+
// version checks
|
16604
|
+
{
|
16605
|
+
const uint32_t magic = file.read_u32();
|
16606
|
+
const uint32_t version = file.read_u32();
|
16607
|
+
|
16608
|
+
if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
|
16609
|
+
LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
|
16610
|
+
return 0;
|
16611
|
+
}
|
16612
|
+
}
|
16613
|
+
|
16614
|
+
// load the prompt
|
16615
|
+
{
|
16616
|
+
const uint32_t n_token_count = file.read_u32();
|
16617
|
+
|
16618
|
+
if (n_token_count > n_token_capacity) {
|
16619
|
+
LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
16620
|
+
return 0;
|
16621
|
+
}
|
16622
|
+
|
16623
|
+
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
16624
|
+
*n_token_count_out = n_token_count;
|
16625
|
+
}
|
16626
|
+
|
16627
|
+
// restore the context state
|
16628
|
+
{
|
16629
|
+
const size_t state_size = file.size - file.tell();
|
16630
|
+
std::vector<uint8_t> state_data(state_size);
|
16631
|
+
file.read_raw(state_data.data(), state_size);
|
16632
|
+
const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
|
16633
|
+
if (!nread) {
|
16634
|
+
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
|
16635
|
+
return 0;
|
16636
|
+
}
|
16637
|
+
GGML_ASSERT(nread <= state_size);
|
16638
|
+
GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
|
16639
|
+
}
|
16640
|
+
|
16641
|
+
return file.tell();
|
16642
|
+
}
|
16643
|
+
|
16644
|
+
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
16645
|
+
try {
|
16646
|
+
return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
|
16647
|
+
} catch (const std::exception & err) {
|
16648
|
+
LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
|
16649
|
+
return 0;
|
16650
|
+
}
|
16651
|
+
}
|
16652
|
+
|
16653
|
+
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
16654
|
+
try {
|
16655
|
+
return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
|
16656
|
+
} catch (const std::exception & err) {
|
16657
|
+
LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
|
16658
|
+
return 0;
|
16659
|
+
}
|
16660
|
+
}
|
16661
|
+
|
15399
16662
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
15400
16663
|
ctx->cparams.n_threads = n_threads;
|
15401
16664
|
ctx->cparams.n_threads_batch = n_threads_batch;
|
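The new llama_state_seq_* functions above serialize only the KV cells that belong to one sequence (cell count, layer count, n_embd_v_gqa, the cell positions, then per-layer key rows and transposed value columns), which makes it possible to move a single conversation between contexts. A hedged usage sketch of the file variants added in this hunk (the path and token bookkeeping are illustrative):

    #include <cstddef>
    #include <vector>
    #include "llama.h"

    // Save sequence 0 of src_ctx to disk, then load it into sequence 1 of dst_ctx.
    static bool copy_seq_via_file(llama_context * src_ctx, llama_context * dst_ctx,
                                  const std::vector<llama_token> & prompt_tokens) {
        const char * path = "seq0.bin"; // illustrative path

        const size_t written = llama_state_seq_save_file(src_ctx, path, /*seq_id =*/ 0,
                                                         prompt_tokens.data(), prompt_tokens.size());
        if (written == 0) return false;

        std::vector<llama_token> tokens_out(prompt_tokens.size());
        size_t n_token_count_out = 0;
        const size_t read = llama_state_seq_load_file(dst_ctx, path, /*dest_seq_id =*/ 1,
                                                      tokens_out.data(), tokens_out.size(),
                                                      &n_token_count_out);
        return read != 0;
    }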
@@ -15509,23 +16772,31 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
     llama_synchronize(ctx);
 
     try {
         if (ctx->logits == nullptr) {
             throw std::runtime_error("no logits");
         }
-
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if (
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d,
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
         return ctx->logits + j*ctx->model.hparams.n_vocab;
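With the index translation added above, llama_get_logits_ith accepts negative indices counted from the end of the output buffer, so -1 addresses the last output row. A minimal sketch of the usual greedy-pick loop, assuming a batch has already been decoded with logits enabled for at least one token (the helper name is illustrative):

#include "llama.h"

// Sketch: greedy-pick the next token from the last output row after llama_decode().
static llama_token pick_greedy_last(llama_context * ctx) {
    const float * logits  = llama_get_logits_ith(ctx, -1);          // -1 = last output row
    const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
    llama_token best = 0;
    for (llama_token t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) best = t;
    }
    return best;
}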
@@ -15545,23 +16816,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);
 
     try {
         if (ctx->embd == nullptr) {
             throw std::runtime_error("no embeddings");
         }
-
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if (
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d,
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
         return ctx->embd + j*ctx->model.hparams.n_embd;
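llama_get_embeddings_ith gains the same negative-index handling. A short sketch under the same assumptions, plus a context created with embeddings enabled (helper name illustrative):

#include <vector>
#include "llama.h"

// Sketch: copy the embedding of the last output row; the returned pointer is owned
// by the context, so the values are copied before the next decode can overwrite them.
static std::vector<float> last_embedding(llama_context * ctx) {
    const float * embd   = llama_get_embeddings_ith(ctx, -1);       // -1 = last output row
    const int32_t n_embd = llama_n_embd(llama_get_model(ctx));
    return std::vector<float>(embd, embd + n_embd);
}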
@@ -15608,6 +16888,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return model->vocab.special_eos_id;
 }
 
+llama_token llama_token_cls(const struct llama_model * model) {
+    return model->vocab.special_cls_id;
+}
+
+llama_token llama_token_sep(const struct llama_model * model) {
+    return model->vocab.special_sep_id;
+}
+
 llama_token llama_token_nl(const struct llama_model * model) {
     return model->vocab.linefeed_id;
 }
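The new llama_token_cls / llama_token_sep accessors expose the classifier and separator ids used by BERT-style encoder models. A hedged sketch of wrapping already-tokenized text with them — the helper is hypothetical, and it assumes ids that are not defined in the GGUF metadata are reported as -1:

#include <vector>
#include "llama.h"

// Hypothetical helper: build a [CLS] ... [SEP] input for a BERT-style embedding model.
static std::vector<llama_token> wrap_cls_sep(const llama_model * model,
                                             const std::vector<llama_token> & text_tokens) {
    std::vector<llama_token> out;
    const llama_token cls = llama_token_cls(model);
    const llama_token sep = llama_token_sep(model);
    if (cls != -1) out.push_back(cls);                   // skip if the model defines no CLS id
    out.insert(out.end(), text_tokens.begin(), text_tokens.end());
    if (sep != -1) out.push_back(sep);                   // skip if the model defines no SEP id
    return out;
}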
@@ -15642,9 +16930,9 @@ int32_t llama_tokenize(
         int32_t text_len,
         llama_token * tokens,
         int32_t n_tokens_max,
-        bool
-        bool
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len),
+        bool add_special,
+        bool parse_special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
 
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
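The renamed parameters make the tokenizer contract explicit: add_special controls insertion of BOS/EOS-style tokens, parse_special controls whether special-token text inside the input is matched. A sketch of the resize-and-retry pattern, assuming the convention that an overflowing call returns the negative of the required token count (helper name and initial buffer size are illustrative):

#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenize a prompt with special tokens added but special markup not parsed.
static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & prompt) {
    std::vector<llama_token> toks(prompt.size() + 8);    // rough upper bound for the sketch
    int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                               toks.data(), (int32_t) toks.size(),
                               /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {                                         // assumed: -n is the required count
        toks.resize((size_t) -n);
        n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                           toks.data(), (int32_t) toks.size(),
                           /*add_special=*/true, /*parse_special=*/false);
    }
    toks.resize(n > 0 ? (size_t) n : 0);
    return toks;
}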
@@ -15910,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
+    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+        // CohereForAI/c4ai-command-r-plus
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "user") {
+                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            } else if (role == "assistant") {
+                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+        }
     } else {
         // template not supported
         return -1;
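The hunk above wires the Command R prompt format into the chat-template dispatcher, selected either by the model's embedded template or by passing "command-r" explicitly through the public llama_chat_apply_template entry point. A minimal rendering sketch — the messages, buffer size, and helper name are illustrative:

#include <string>
#include <vector>
#include "llama.h"

// Sketch: render a short chat with the Command R template and request an assistant turn.
static std::string render_command_r(const llama_model * model) {
    const llama_chat_message msgs[] = {
        { "system", "You are a concise assistant." },
        { "user",   "Summarize the change in one sentence." },
    };
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(model, "command-r", msgs, 2,
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {       // the rendered prompt needed more room: grow and retry
        buf.resize((size_t) n);
        n = llama_chat_apply_template(model, "command-r", msgs, 2, true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}

With add_ass set, the rendered prompt ends in "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", matching the generation-prompt behavior added in the hunk.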