llama_cpp 0.15.3 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
@@ -103,7 +103,7 @@
|
|
103
103
|
#endif
|
104
104
|
|
105
105
|
#define LLAMA_MAX_NODES 8192
|
106
|
-
#define LLAMA_MAX_EXPERTS
|
106
|
+
#define LLAMA_MAX_EXPERTS 160
|
107
107
|
|
108
108
|
//
|
109
109
|
// logging
|
@@ -222,6 +222,7 @@ enum llm_arch {
|
|
222
222
|
LLM_ARCH_DBRX,
|
223
223
|
LLM_ARCH_OLMO,
|
224
224
|
LLM_ARCH_ARCTIC,
|
225
|
+
LLM_ARCH_DEEPSEEK2,
|
225
226
|
LLM_ARCH_UNKNOWN,
|
226
227
|
};
|
227
228
|
|
@@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
259
260
|
{ LLM_ARCH_DBRX, "dbrx" },
|
260
261
|
{ LLM_ARCH_OLMO, "olmo" },
|
261
262
|
{ LLM_ARCH_ARCTIC, "arctic" },
|
263
|
+
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
262
264
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
263
265
|
};
|
264
266
|
|
@@ -279,11 +281,15 @@ enum llm_kv {
|
|
279
281
|
LLM_KV_CONTEXT_LENGTH,
|
280
282
|
LLM_KV_EMBEDDING_LENGTH,
|
281
283
|
LLM_KV_BLOCK_COUNT,
|
284
|
+
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
282
285
|
LLM_KV_FEED_FORWARD_LENGTH,
|
286
|
+
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
|
283
287
|
LLM_KV_USE_PARALLEL_RESIDUAL,
|
284
288
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
285
289
|
LLM_KV_EXPERT_COUNT,
|
286
290
|
LLM_KV_EXPERT_USED_COUNT,
|
291
|
+
LLM_KV_EXPERT_SHARED_COUNT,
|
292
|
+
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
287
293
|
LLM_KV_POOLING_TYPE,
|
288
294
|
LLM_KV_LOGIT_SCALE,
|
289
295
|
|
@@ -296,6 +302,8 @@ enum llm_kv {
|
|
296
302
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
297
303
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
298
304
|
LLM_KV_ATTENTION_CAUSAL,
|
305
|
+
LLM_KV_ATTENTION_Q_LORA_RANK,
|
306
|
+
LLM_KV_ATTENTION_KV_LORA_RANK,
|
299
307
|
|
300
308
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
301
309
|
LLM_KV_ROPE_FREQ_BASE,
|
@@ -305,6 +313,7 @@ enum llm_kv {
|
|
305
313
|
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
|
306
314
|
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
307
315
|
LLM_KV_ROPE_SCALING_FINETUNED,
|
316
|
+
LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
|
308
317
|
|
309
318
|
LLM_KV_SPLIT_NO,
|
310
319
|
LLM_KV_SPLIT_COUNT,
|
@@ -353,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
353
362
|
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
354
363
|
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
355
364
|
|
356
|
-
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size"
|
357
|
-
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length"
|
358
|
-
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length"
|
359
|
-
{ LLM_KV_BLOCK_COUNT, "%s.block_count"
|
360
|
-
{
|
361
|
-
{
|
362
|
-
{
|
363
|
-
{
|
364
|
-
{
|
365
|
-
{
|
366
|
-
{
|
365
|
+
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
366
|
+
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
367
|
+
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
368
|
+
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
369
|
+
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
370
|
+
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
371
|
+
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
|
372
|
+
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
373
|
+
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
374
|
+
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
375
|
+
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
376
|
+
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
|
377
|
+
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
378
|
+
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
379
|
+
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
367
380
|
|
368
381
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
369
382
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
@@ -374,6 +387,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
374
387
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
375
388
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
376
389
|
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
390
|
+
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
|
391
|
+
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
|
377
392
|
|
378
393
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
379
394
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
@@ -383,6 +398,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
383
398
|
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
|
384
399
|
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
385
400
|
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
401
|
+
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
|
386
402
|
|
387
403
|
{ LLM_KV_SPLIT_NO, "split.no" },
|
388
404
|
{ LLM_KV_SPLIT_COUNT, "split.count" },
|
@@ -474,6 +490,12 @@ enum llm_tensor {
|
|
474
490
|
LLM_TENSOR_SSM_A,
|
475
491
|
LLM_TENSOR_SSM_D,
|
476
492
|
LLM_TENSOR_SSM_OUT,
|
493
|
+
LLM_TENSOR_ATTN_Q_A,
|
494
|
+
LLM_TENSOR_ATTN_Q_B,
|
495
|
+
LLM_TENSOR_ATTN_KV_A_MQA,
|
496
|
+
LLM_TENSOR_ATTN_KV_B,
|
497
|
+
LLM_TENSOR_ATTN_Q_A_NORM,
|
498
|
+
LLM_TENSOR_ATTN_KV_A_NORM,
|
477
499
|
};
|
478
500
|
|
479
501
|
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
@@ -1057,6 +1079,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|
1057
1079
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1058
1080
|
},
|
1059
1081
|
},
|
1082
|
+
{
|
1083
|
+
LLM_ARCH_DEEPSEEK2,
|
1084
|
+
{
|
1085
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
1086
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
1087
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
1088
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
1089
|
+
{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
|
1090
|
+
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
|
1091
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
1092
|
+
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
|
1093
|
+
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
|
1094
|
+
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
|
1095
|
+
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
|
1096
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
1097
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
1098
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
1099
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
1100
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
1101
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
1102
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
1103
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
1104
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
1105
|
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
1106
|
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
1107
|
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
1108
|
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
1109
|
+
},
|
1110
|
+
},
|
1060
1111
|
{
|
1061
1112
|
LLM_ARCH_UNKNOWN,
|
1062
1113
|
{
|
@@ -1651,12 +1702,13 @@ struct llama_mlock {
|
|
1651
1702
|
};
|
1652
1703
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
1653
1704
|
|
1654
|
-
|
1705
|
+
// NOTE: avoid ever using this except for building the token_to_piece caches
|
1706
|
+
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
1655
1707
|
std::vector<char> result(8, 0);
|
1656
|
-
const int n_tokens = llama_token_to_piece(
|
1708
|
+
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
1657
1709
|
if (n_tokens < 0) {
|
1658
1710
|
result.resize(-n_tokens);
|
1659
|
-
int check = llama_token_to_piece(
|
1711
|
+
int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
1660
1712
|
GGML_ASSERT(check == -n_tokens);
|
1661
1713
|
}
|
1662
1714
|
else {
|
@@ -1741,6 +1793,7 @@ enum e_model {
|
|
1741
1793
|
MODEL_13B,
|
1742
1794
|
MODEL_14B,
|
1743
1795
|
MODEL_15B,
|
1796
|
+
MODEL_16B,
|
1744
1797
|
MODEL_20B,
|
1745
1798
|
MODEL_30B,
|
1746
1799
|
MODEL_34B,
|
@@ -1748,6 +1801,7 @@ enum e_model {
|
|
1748
1801
|
MODEL_40B,
|
1749
1802
|
MODEL_65B,
|
1750
1803
|
MODEL_70B,
|
1804
|
+
MODEL_236B,
|
1751
1805
|
MODEL_314B,
|
1752
1806
|
MODEL_SMALL,
|
1753
1807
|
MODEL_MEDIUM,
|
@@ -1783,6 +1837,13 @@ struct llama_hparams {
|
|
1783
1837
|
uint32_t n_expert_used = 0;
|
1784
1838
|
uint32_t n_vocab_type = 0; // for BERT-style token types
|
1785
1839
|
|
1840
|
+
uint32_t n_layer_dense_lead = 0;
|
1841
|
+
uint32_t n_lora_q = 0;
|
1842
|
+
uint32_t n_lora_kv = 0;
|
1843
|
+
uint32_t n_ff_exp = 0;
|
1844
|
+
uint32_t n_expert_shared = 0;
|
1845
|
+
float expert_weights_scale = 0.0;
|
1846
|
+
|
1786
1847
|
float f_norm_eps;
|
1787
1848
|
float f_norm_rms_eps;
|
1788
1849
|
|
@@ -1790,6 +1851,7 @@ struct llama_hparams {
|
|
1790
1851
|
float rope_freq_base_train;
|
1791
1852
|
float rope_freq_scale_train;
|
1792
1853
|
uint32_t n_yarn_orig_ctx;
|
1854
|
+
float rope_yarn_log_mul;
|
1793
1855
|
|
1794
1856
|
// for State Space Models
|
1795
1857
|
uint32_t ssm_d_conv = 0;
|
@@ -1823,6 +1885,12 @@ struct llama_hparams {
|
|
1823
1885
|
if (this->n_expert != other.n_expert) return true;
|
1824
1886
|
if (this->n_expert_used != other.n_expert_used) return true;
|
1825
1887
|
|
1888
|
+
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
1889
|
+
if (this->n_lora_q != other.n_lora_q) return true;
|
1890
|
+
if (this->n_lora_kv != other.n_lora_kv) return true;
|
1891
|
+
if (this->n_ff_exp != other.n_ff_exp) return true;
|
1892
|
+
if (this->n_expert_shared != other.n_expert_shared) return true;
|
1893
|
+
|
1826
1894
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
1827
1895
|
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
1828
1896
|
|
@@ -1838,6 +1906,8 @@ struct llama_hparams {
|
|
1838
1906
|
if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
|
1839
1907
|
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
|
1840
1908
|
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
|
1909
|
+
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
|
1910
|
+
if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
|
1841
1911
|
|
1842
1912
|
return false;
|
1843
1913
|
}
|
@@ -1913,6 +1983,8 @@ struct llama_layer {
|
|
1913
1983
|
struct ggml_tensor * attn_k_norm_b;
|
1914
1984
|
struct ggml_tensor * attn_out_norm;
|
1915
1985
|
struct ggml_tensor * attn_out_norm_b;
|
1986
|
+
struct ggml_tensor * attn_q_a_norm;
|
1987
|
+
struct ggml_tensor * attn_kv_a_norm;
|
1916
1988
|
|
1917
1989
|
// attention
|
1918
1990
|
struct ggml_tensor * wq;
|
@@ -1920,6 +1992,10 @@ struct llama_layer {
|
|
1920
1992
|
struct ggml_tensor * wv;
|
1921
1993
|
struct ggml_tensor * wo;
|
1922
1994
|
struct ggml_tensor * wqkv;
|
1995
|
+
struct ggml_tensor * wq_a;
|
1996
|
+
struct ggml_tensor * wq_b;
|
1997
|
+
struct ggml_tensor * wkv_a_mqa;
|
1998
|
+
struct ggml_tensor * wkv_b;
|
1923
1999
|
|
1924
2000
|
// attention bias
|
1925
2001
|
struct ggml_tensor * bq;
|
@@ -1953,8 +2029,9 @@ struct llama_layer {
|
|
1953
2029
|
struct ggml_tensor * ffn_up_shexp;
|
1954
2030
|
|
1955
2031
|
// ff bias
|
1956
|
-
struct ggml_tensor *
|
1957
|
-
struct ggml_tensor *
|
2032
|
+
struct ggml_tensor * ffn_gate_b = nullptr;
|
2033
|
+
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
2034
|
+
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
1958
2035
|
struct ggml_tensor * ffn_act;
|
1959
2036
|
|
1960
2037
|
// mamba proj
|
@@ -2086,7 +2163,9 @@ struct llama_vocab {
|
|
2086
2163
|
std::unordered_map<token, id> token_to_id;
|
2087
2164
|
std::vector<token_data> id_to_token;
|
2088
2165
|
|
2089
|
-
std::
|
2166
|
+
std::vector<id> cache_special_tokens;
|
2167
|
+
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
|
2168
|
+
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
2090
2169
|
|
2091
2170
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
2092
2171
|
|
@@ -3832,6 +3911,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
3832
3911
|
case MODEL_13B: return "13B";
|
3833
3912
|
case MODEL_14B: return "14B";
|
3834
3913
|
case MODEL_15B: return "15B";
|
3914
|
+
case MODEL_16B: return "16B";
|
3835
3915
|
case MODEL_20B: return "20B";
|
3836
3916
|
case MODEL_30B: return "30B";
|
3837
3917
|
case MODEL_34B: return "34B";
|
@@ -3839,6 +3919,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
3839
3919
|
case MODEL_40B: return "40B";
|
3840
3920
|
case MODEL_65B: return "65B";
|
3841
3921
|
case MODEL_70B: return "70B";
|
3922
|
+
case MODEL_236B: return "236B";
|
3842
3923
|
case MODEL_314B: return "314B";
|
3843
3924
|
case MODEL_SMALL: return "0.1B";
|
3844
3925
|
case MODEL_MEDIUM: return "0.4B";
|
@@ -3981,7 +4062,9 @@ static void llm_load_hparams(
|
|
3981
4062
|
switch (hparams.n_layer) {
|
3982
4063
|
case 22: model.type = e_model::MODEL_1B; break;
|
3983
4064
|
case 26: model.type = e_model::MODEL_3B; break;
|
3984
|
-
|
4065
|
+
// granite uses a vocab with len 49152
|
4066
|
+
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
|
4067
|
+
case 36: model.type = e_model::MODEL_8B; break; // granite
|
3985
4068
|
case 40: model.type = e_model::MODEL_13B; break;
|
3986
4069
|
case 48: model.type = e_model::MODEL_34B; break;
|
3987
4070
|
case 60: model.type = e_model::MODEL_30B; break;
|
@@ -4251,6 +4334,8 @@ static void llm_load_hparams(
|
|
4251
4334
|
case 30: model.type = e_model::MODEL_3B; break;
|
4252
4335
|
case 32: model.type = e_model::MODEL_7B; break;
|
4253
4336
|
case 40: model.type = e_model::MODEL_15B; break;
|
4337
|
+
case 52: model.type = e_model::MODEL_20B; break; // granite
|
4338
|
+
case 88: model.type = e_model::MODEL_34B; break; // granite
|
4254
4339
|
default: model.type = e_model::MODEL_UNKNOWN;
|
4255
4340
|
}
|
4256
4341
|
} break;
|
@@ -4384,6 +4469,26 @@ static void llm_load_hparams(
|
|
4384
4469
|
model.type = e_model::MODEL_UNKNOWN;
|
4385
4470
|
}
|
4386
4471
|
} break;
|
4472
|
+
case LLM_ARCH_DEEPSEEK2:
|
4473
|
+
{
|
4474
|
+
bool is_lite = (hparams.n_layer == 27);
|
4475
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
4476
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
4477
|
+
if (!is_lite) {
|
4478
|
+
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
4479
|
+
}
|
4480
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
4481
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
4482
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
4483
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
4484
|
+
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
4485
|
+
|
4486
|
+
switch (hparams.n_layer) {
|
4487
|
+
case 27: model.type = e_model::MODEL_16B; break;
|
4488
|
+
case 60: model.type = e_model::MODEL_236B; break;
|
4489
|
+
default: model.type = e_model::MODEL_UNKNOWN;
|
4490
|
+
}
|
4491
|
+
} break;
|
4387
4492
|
default: (void)0;
|
4388
4493
|
}
|
4389
4494
|
|
@@ -4490,15 +4595,14 @@ static void llm_load_vocab(
|
|
4490
4595
|
vocab.special_cls_id = 101;
|
4491
4596
|
vocab.special_mask_id = 103;
|
4492
4597
|
vocab.add_space_prefix = false;
|
4493
|
-
} else {
|
4494
|
-
|
4495
|
-
|
4496
|
-
|
4497
|
-
|
4498
|
-
|
4499
|
-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
4500
|
-
return;
|
4598
|
+
} else if (tokenizer_model == "gpt2") {
|
4599
|
+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
4600
|
+
|
4601
|
+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
4602
|
+
if (add_space_prefix_keyidx != -1) {
|
4603
|
+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
4501
4604
|
}
|
4605
|
+
|
4502
4606
|
// read bpe merges and populate bpe ranks
|
4503
4607
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
4504
4608
|
if (merges_keyidx == -1) {
|
@@ -4532,6 +4636,8 @@ static void llm_load_vocab(
|
|
4532
4636
|
vocab.special_pad_id = -1;
|
4533
4637
|
vocab.special_cls_id = -1;
|
4534
4638
|
vocab.special_mask_id = -1;
|
4639
|
+
} else {
|
4640
|
+
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
4535
4641
|
}
|
4536
4642
|
|
4537
4643
|
// for now, only BPE models have pre-tokenizers
|
@@ -4593,6 +4699,9 @@ static void llm_load_vocab(
|
|
4593
4699
|
} else if (
|
4594
4700
|
tokenizer_pre == "dbrx") {
|
4595
4701
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
4702
|
+
} else if (
|
4703
|
+
tokenizer_pre == "smaug-bpe") {
|
4704
|
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
4596
4705
|
} else {
|
4597
4706
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
4598
4707
|
}
|
@@ -4721,97 +4830,40 @@ static void llm_load_vocab(
|
|
4721
4830
|
|
4722
4831
|
// build special tokens cache
|
4723
4832
|
{
|
4724
|
-
|
4725
|
-
// and will always be correctly labeled in 'added_tokens.json' etc.
|
4726
|
-
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
4727
|
-
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
4728
|
-
// are special tokens.
|
4729
|
-
// From testing, this appears to correlate 1:1 with special tokens.
|
4730
|
-
//
|
4731
|
-
|
4732
|
-
// Counting special tokens and verifying in only one direction
|
4733
|
-
// is sufficient to detect difference in those two sets.
|
4734
|
-
//
|
4735
|
-
uint32_t special_tokens_count_by_type = 0;
|
4736
|
-
uint32_t special_tokens_count_from_verification = 0;
|
4737
|
-
|
4738
|
-
bool special_tokens_definition_mismatch = false;
|
4739
|
-
|
4740
|
-
for (const auto & t : vocab.token_to_id) {
|
4741
|
-
const auto & token = t.first;
|
4742
|
-
const auto & id = t.second;
|
4743
|
-
|
4744
|
-
// Count all non-normal tokens in the vocab while iterating
|
4833
|
+
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
4745
4834
|
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
4746
|
-
|
4835
|
+
vocab.cache_special_tokens.push_back(id);
|
4747
4836
|
}
|
4837
|
+
}
|
4748
4838
|
|
4749
|
-
|
4750
|
-
|
4751
|
-
|
4752
|
-
|
4753
|
-
|
4754
|
-
// and check if both halves can be matched to a valid token
|
4755
|
-
for (unsigned i = 1; i < token.length();) {
|
4756
|
-
const auto left = token.substr(0, i);
|
4757
|
-
const auto right = token.substr(i);
|
4758
|
-
|
4759
|
-
// check if we didnt partition in the middle of a utf sequence
|
4760
|
-
auto utf = utf8_len(left.at(left.length() - 1));
|
4761
|
-
|
4762
|
-
if (utf == 1) {
|
4763
|
-
if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
|
4764
|
-
vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
|
4765
|
-
is_tokenizable = true;
|
4766
|
-
break;
|
4767
|
-
}
|
4768
|
-
i++;
|
4769
|
-
} else {
|
4770
|
-
// skip over the rest of multibyte utf sequence
|
4771
|
-
i += utf - 1;
|
4772
|
-
}
|
4773
|
-
}
|
4839
|
+
std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
|
4840
|
+
[&] (const llama_vocab::id a, const llama_vocab::id b) {
|
4841
|
+
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
|
4842
|
+
}
|
4843
|
+
);
|
4774
4844
|
|
4775
|
-
|
4776
|
-
|
4777
|
-
// it's faster to re-filter them here, since there are way less candidates now
|
4845
|
+
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
|
4846
|
+
}
|
4778
4847
|
|
4779
|
-
|
4780
|
-
|
4781
|
-
|
4782
|
-
utf8_str_len++;
|
4783
|
-
i += utf8_len(token.at(i));
|
4784
|
-
}
|
4848
|
+
// build token to piece caches
|
4849
|
+
{
|
4850
|
+
size_t size_cache = 0;
|
4785
4851
|
|
4786
|
-
|
4787
|
-
|
4788
|
-
// At this point what we have left are special tokens only
|
4789
|
-
vocab.special_tokens_cache[token] = id;
|
4852
|
+
std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
|
4853
|
+
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
|
4790
4854
|
|
4791
|
-
|
4792
|
-
|
4855
|
+
for (uint32_t id = 0; id < n_vocab; ++id) {
|
4856
|
+
cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
|
4857
|
+
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
|
4793
4858
|
|
4794
|
-
|
4795
|
-
|
4796
|
-
special_tokens_definition_mismatch = true;
|
4797
|
-
}
|
4798
|
-
}
|
4799
|
-
}
|
4800
|
-
}
|
4859
|
+
size_cache += cache_token_to_piece[id].size();
|
4860
|
+
size_cache += cache_token_to_piece_special[id].size();
|
4801
4861
|
}
|
4802
4862
|
|
4803
|
-
|
4804
|
-
|
4805
|
-
|
4806
|
-
|
4807
|
-
special_tokens_count_by_type, vocab.id_to_token.size()
|
4808
|
-
);
|
4809
|
-
} else {
|
4810
|
-
LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
|
4811
|
-
__func__,
|
4812
|
-
special_tokens_count_from_verification, vocab.id_to_token.size()
|
4813
|
-
);
|
4814
|
-
}
|
4863
|
+
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
|
4864
|
+
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
|
4865
|
+
|
4866
|
+
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
4815
4867
|
}
|
4816
4868
|
}
|
4817
4869
|
|
@@ -4892,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
4892
4944
|
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
4893
4945
|
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
4894
4946
|
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
4947
|
+
|
4948
|
+
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
4949
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
4950
|
+
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
4951
|
+
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
4952
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
4953
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
4954
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
4955
|
+
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
4956
|
+
}
|
4895
4957
|
}
|
4896
4958
|
|
4897
4959
|
// Returns false if cancelled by progress_callback
|
@@ -5048,8 +5110,6 @@ static bool llm_load_tensors(
|
|
5048
5110
|
throw std::runtime_error("model has expert layers but no expert layers are used");
|
5049
5111
|
}
|
5050
5112
|
|
5051
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
5052
|
-
|
5053
5113
|
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
|
5054
5114
|
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
|
5055
5115
|
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
|
@@ -5103,6 +5163,11 @@ static bool llm_load_tensors(
|
|
5103
5163
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5104
5164
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
5105
5165
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
5166
|
+
|
5167
|
+
// optional MLP bias
|
5168
|
+
layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5169
|
+
layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5170
|
+
layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5106
5171
|
} else {
|
5107
5172
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
5108
5173
|
|
@@ -6210,6 +6275,70 @@ static bool llm_load_tensors(
|
|
6210
6275
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
6211
6276
|
}
|
6212
6277
|
} break;
|
6278
|
+
case LLM_ARCH_DEEPSEEK2:
|
6279
|
+
{
|
6280
|
+
bool is_lite = (hparams.n_layer == 27);
|
6281
|
+
|
6282
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
6283
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
6284
|
+
const uint32_t q_lora_rank = hparams.n_lora_q;
|
6285
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
6286
|
+
const uint32_t n_ff_exp = hparams.n_ff_exp;
|
6287
|
+
|
6288
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
6289
|
+
|
6290
|
+
// output
|
6291
|
+
{
|
6292
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
6293
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
6294
|
+
}
|
6295
|
+
|
6296
|
+
for (int i = 0; i < n_layer; ++i) {
|
6297
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
6298
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
6299
|
+
|
6300
|
+
auto & layer = model.layers[i];
|
6301
|
+
|
6302
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
6303
|
+
if (!is_lite) {
|
6304
|
+
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
|
6305
|
+
}
|
6306
|
+
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
|
6307
|
+
|
6308
|
+
if (!is_lite) {
|
6309
|
+
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
|
6310
|
+
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
|
6311
|
+
} else {
|
6312
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
|
6313
|
+
}
|
6314
|
+
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
|
6315
|
+
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
|
6316
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
|
6317
|
+
|
6318
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
6319
|
+
|
6320
|
+
if ((uint32_t) i < hparams.n_layer_dense_lead) {
|
6321
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
6322
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
6323
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6324
|
+
} else {
|
6325
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
6326
|
+
|
6327
|
+
GGML_ASSERT(hparams.n_expert > 0);
|
6328
|
+
GGML_ASSERT(hparams.n_expert_used > 0);
|
6329
|
+
|
6330
|
+
// MoE branch
|
6331
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6332
|
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
|
6333
|
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
|
6334
|
+
|
6335
|
+
// Shared expert branch
|
6336
|
+
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6337
|
+
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
|
6338
|
+
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
|
6339
|
+
}
|
6340
|
+
}
|
6341
|
+
} break;
|
6213
6342
|
default:
|
6214
6343
|
throw std::runtime_error("unknown architecture");
|
6215
6344
|
}
|
@@ -6664,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6664
6793
|
int64_t n_expert_used,
|
6665
6794
|
llm_ffn_op_type type_op,
|
6666
6795
|
bool norm_w,
|
6796
|
+
bool scale_w,
|
6797
|
+
float w_scale,
|
6667
6798
|
const llm_build_cb & cb,
|
6668
6799
|
int il) {
|
6669
6800
|
int64_t n_embd = cur->ne[0];
|
@@ -6695,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|
6695
6826
|
|
6696
6827
|
weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
|
6697
6828
|
}
|
6829
|
+
if (scale_w) {
|
6830
|
+
weights = ggml_scale(ctx, weights, w_scale);
|
6831
|
+
cb(weights, "ffn_moe_weights_scaled", il);
|
6832
|
+
}
|
6698
6833
|
|
6699
6834
|
cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
6700
6835
|
ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
@@ -7305,9 +7440,9 @@ struct llm_build_context {
|
|
7305
7440
|
cb(cur, "ffn_norm", il);
|
7306
7441
|
|
7307
7442
|
cur = llm_build_ffn(ctx0, cur,
|
7308
|
-
model.layers[il].ffn_up,
|
7309
|
-
model.layers[il].ffn_gate,
|
7310
|
-
model.layers[il].ffn_down,
|
7443
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
7444
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
|
7445
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
7311
7446
|
NULL,
|
7312
7447
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
7313
7448
|
cb(cur, "ffn_out", il);
|
@@ -7325,6 +7460,7 @@ struct llm_build_context {
|
|
7325
7460
|
model.layers[il].ffn_down_exps,
|
7326
7461
|
n_expert, n_expert_used,
|
7327
7462
|
LLM_FFN_SILU, true,
|
7463
|
+
false, 0.0,
|
7328
7464
|
cb, il);
|
7329
7465
|
cb(cur, "ffn_moe_out", il);
|
7330
7466
|
}
|
@@ -7806,6 +7942,7 @@ struct llm_build_context {
|
|
7806
7942
|
model.layers[il].ffn_down_exps,
|
7807
7943
|
n_expert, n_expert_used,
|
7808
7944
|
LLM_FFN_GELU, true,
|
7945
|
+
false, 0.0,
|
7809
7946
|
cb, il);
|
7810
7947
|
cb(cur, "ffn_moe_out", il);
|
7811
7948
|
|
@@ -7949,6 +8086,7 @@ struct llm_build_context {
|
|
7949
8086
|
model.layers[il].ffn_down_exps,
|
7950
8087
|
n_expert, n_expert_used,
|
7951
8088
|
LLM_FFN_SILU, true,
|
8089
|
+
false, 0.0,
|
7952
8090
|
cb, il);
|
7953
8091
|
cb(cur, "ffn_moe_out", il);
|
7954
8092
|
|
@@ -9087,6 +9225,7 @@ struct llm_build_context {
|
|
9087
9225
|
model.layers[il].ffn_down_exps,
|
9088
9226
|
n_expert, n_expert_used,
|
9089
9227
|
LLM_FFN_SILU, false,
|
9228
|
+
false, 0.0,
|
9090
9229
|
cb, il);
|
9091
9230
|
cb(cur, "ffn_moe_out", il);
|
9092
9231
|
|
@@ -10974,6 +11113,7 @@ struct llm_build_context {
|
|
10974
11113
|
model.layers[il].ffn_down_exps,
|
10975
11114
|
n_expert, n_expert_used,
|
10976
11115
|
LLM_FFN_SILU, true,
|
11116
|
+
false, 0.0,
|
10977
11117
|
cb, il);
|
10978
11118
|
cb(cur, "ffn_moe_out", il);
|
10979
11119
|
|
@@ -11005,6 +11145,239 @@ struct llm_build_context {
|
|
11005
11145
|
|
11006
11146
|
return gf;
|
11007
11147
|
}
|
11148
|
+
|
11149
|
+
struct ggml_cgraph * build_deepseek2() {
|
11150
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
11151
|
+
|
11152
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
11153
|
+
int32_t n_tokens = this->n_tokens;
|
11154
|
+
|
11155
|
+
bool is_lite = (hparams.n_layer == 27);
|
11156
|
+
|
11157
|
+
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
11158
|
+
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
11159
|
+
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
11160
|
+
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
|
11161
|
+
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
11162
|
+
|
11163
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
11164
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
11165
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
11166
|
+
|
11167
|
+
struct ggml_tensor * cur;
|
11168
|
+
struct ggml_tensor * inpL;
|
11169
|
+
|
11170
|
+
// {n_embd, n_tokens}
|
11171
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
11172
|
+
|
11173
|
+
// inp_pos - contains the positions
|
11174
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
11175
|
+
|
11176
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
11177
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
11178
|
+
|
11179
|
+
for (int il = 0; il < n_layer; ++il) {
|
11180
|
+
struct ggml_tensor * inpSA = inpL;
|
11181
|
+
|
11182
|
+
// norm
|
11183
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
11184
|
+
model.layers[il].attn_norm, NULL,
|
11185
|
+
LLM_NORM_RMS, cb, il);
|
11186
|
+
cb(cur, "attn_norm", il);
|
11187
|
+
|
11188
|
+
// self_attention
|
11189
|
+
{
|
11190
|
+
struct ggml_tensor * q = NULL;
|
11191
|
+
if (!is_lite) {
|
11192
|
+
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
11193
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
11194
|
+
cb(q, "q", il);
|
11195
|
+
|
11196
|
+
q = llm_build_norm(ctx0, q, hparams,
|
11197
|
+
model.layers[il].attn_q_a_norm, NULL,
|
11198
|
+
LLM_NORM_RMS, cb, il);
|
11199
|
+
cb(q, "q", il);
|
11200
|
+
|
11201
|
+
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
11202
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
11203
|
+
cb(q, "q", il);
|
11204
|
+
} else {
|
11205
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
11206
|
+
cb(q, "q", il);
|
11207
|
+
}
|
11208
|
+
|
11209
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11210
|
+
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
11211
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11212
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11213
|
+
0);
|
11214
|
+
cb(q_nope, "q_nope", il);
|
11215
|
+
|
11216
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
11217
|
+
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
11218
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
11219
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
11220
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
11221
|
+
cb(q_pe, "q_pe", il);
|
11222
|
+
|
11223
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
11224
|
+
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
11225
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
11226
|
+
|
11227
|
+
// split into {kv_lora_rank, n_tokens}
|
11228
|
+
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
11229
|
+
kv_pe_compresseed->nb[1],
|
11230
|
+
0);
|
11231
|
+
cb(kv_compressed, "kv_compressed", il);
|
11232
|
+
|
11233
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
11234
|
+
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
11235
|
+
kv_pe_compresseed->nb[1],
|
11236
|
+
kv_pe_compresseed->nb[1],
|
11237
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
11238
|
+
cb(k_pe, "k_pe", il);
|
11239
|
+
|
11240
|
+
kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
|
11241
|
+
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
|
11242
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
11243
|
+
LLM_NORM_RMS, cb, il);
|
11244
|
+
cb(kv_compressed, "kv_compressed", il);
|
11245
|
+
|
11246
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
11247
|
+
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
11248
|
+
cb(kv, "kv", il);
|
11249
|
+
|
11250
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
11251
|
+
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
11252
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
11253
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11254
|
+
0);
|
11255
|
+
cb(k_nope, "k_nope", il);
|
11256
|
+
|
11257
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
11258
|
+
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
11259
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
11260
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
11261
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
11262
|
+
cb(v_states, "v_states", il);
|
11263
|
+
|
11264
|
+
v_states = ggml_cont(ctx0, v_states);
|
11265
|
+
cb(v_states, "v_states", il);
|
11266
|
+
|
11267
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
11268
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
11269
|
+
0);
|
11270
|
+
cb(v_states, "v_states", il);
|
11271
|
+
|
11272
|
+
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11273
|
+
q_pe = ggml_rope_ext(
|
11274
|
+
ctx0, q_pe, inp_pos, nullptr,
|
11275
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11276
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11277
|
+
);
|
11278
|
+
cb(q_pe, "q_pe", il);
|
11279
|
+
|
11280
|
+
// shared RoPE key
|
11281
|
+
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
|
11282
|
+
k_pe = ggml_rope_ext(
|
11283
|
+
ctx0, k_pe, inp_pos, nullptr,
|
11284
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
11285
|
+
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
11286
|
+
);
|
11287
|
+
cb(k_pe, "k_pe", il);
|
11288
|
+
|
11289
|
+
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
11290
|
+
cb(q_states, "q_states", il);
|
11291
|
+
|
11292
|
+
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
11293
|
+
cb(k_states, "k_states", il);
|
11294
|
+
|
11295
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
11296
|
+
model.layers[il].wo, NULL,
|
11297
|
+
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
11298
|
+
}
|
11299
|
+
|
11300
|
+
if (il == n_layer - 1) {
|
11301
|
+
// skip computing output for unused tokens
|
11302
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
11303
|
+
n_tokens = n_outputs;
|
11304
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
11305
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
11306
|
+
}
|
11307
|
+
|
11308
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
11309
|
+
cb(ffn_inp, "ffn_inp", il);
|
11310
|
+
|
11311
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
11312
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11313
|
+
model.layers[il].ffn_norm, NULL,
|
11314
|
+
LLM_NORM_RMS, cb, il);
|
11315
|
+
cb(cur, "ffn_norm", il);
|
11316
|
+
|
11317
|
+
cur = llm_build_ffn(ctx0, cur,
|
11318
|
+
model.layers[il].ffn_up, NULL,
|
11319
|
+
model.layers[il].ffn_gate, NULL,
|
11320
|
+
model.layers[il].ffn_down, NULL,
|
11321
|
+
NULL,
|
11322
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11323
|
+
cb(cur, "ffn_out", il);
|
11324
|
+
} else {
|
11325
|
+
// MoE branch
|
11326
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
11327
|
+
model.layers[il].ffn_norm, NULL,
|
11328
|
+
LLM_NORM_RMS, cb, il);
|
11329
|
+
cb(cur, "ffn_norm", il);
|
11330
|
+
|
11331
|
+
ggml_tensor * moe_out =
|
11332
|
+
llm_build_moe_ffn(ctx0, cur,
|
11333
|
+
model.layers[il].ffn_gate_inp,
|
11334
|
+
model.layers[il].ffn_up_exps,
|
11335
|
+
model.layers[il].ffn_gate_exps,
|
11336
|
+
model.layers[il].ffn_down_exps,
|
11337
|
+
n_expert, n_expert_used,
|
11338
|
+
LLM_FFN_SILU, false,
|
11339
|
+
true, hparams.expert_weights_scale,
|
11340
|
+
cb, il);
|
11341
|
+
cb(moe_out, "ffn_moe_out", il);
|
11342
|
+
|
11343
|
+
// FFN shared expert
|
11344
|
+
{
|
11345
|
+
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
|
11346
|
+
model.layers[il].ffn_up_shexp, NULL,
|
11347
|
+
model.layers[il].ffn_gate_shexp, NULL,
|
11348
|
+
model.layers[il].ffn_down_shexp, NULL,
|
11349
|
+
NULL,
|
11350
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
11351
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
11352
|
+
|
11353
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
11354
|
+
cb(cur, "ffn_out", il);
|
11355
|
+
}
|
11356
|
+
}
|
11357
|
+
|
11358
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
11359
|
+
cb(cur, "l_out", il);
|
11360
|
+
|
11361
|
+
// input for next layer
|
11362
|
+
inpL = cur;
|
11363
|
+
}
|
11364
|
+
|
11365
|
+
cur = inpL;
|
11366
|
+
|
11367
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
11368
|
+
model.output_norm, NULL,
|
11369
|
+
LLM_NORM_RMS, cb, -1);
|
11370
|
+
cb(cur, "result_norm", -1);
|
11371
|
+
|
11372
|
+
// lm_head
|
11373
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
11374
|
+
cb(cur, "result_output", -1);
|
11375
|
+
|
11376
|
+
ggml_build_forward_expand(gf, cur);
|
11377
|
+
|
11378
|
+
return gf;
|
11379
|
+
}
|
11380
|
+
|
11008
11381
|
};
|
11009
11382
|
|
11010
11383
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
@@ -11223,6 +11596,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
11223
11596
|
{
|
11224
11597
|
result = llm.build_arctic();
|
11225
11598
|
} break;
|
11599
|
+
case LLM_ARCH_DEEPSEEK2:
|
11600
|
+
{
|
11601
|
+
result = llm.build_deepseek2();
|
11602
|
+
} break;
|
11226
11603
|
default:
|
11227
11604
|
GGML_ASSERT(false);
|
11228
11605
|
}
|
@@ -12512,6 +12889,7 @@ struct llm_tokenizer_bpe {
|
|
12512
12889
|
});
|
12513
12890
|
break;
|
12514
12891
|
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
12892
|
+
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
12515
12893
|
word_collection = unicode_regex_split(text, {
|
12516
12894
|
// same as llama3
|
12517
12895
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
@@ -12734,7 +13112,7 @@ struct llm_tokenizer_wpm {
|
|
12734
13112
|
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
12735
13113
|
|
12736
13114
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
12737
|
-
auto
|
13115
|
+
const auto & token_map = vocab.token_to_id;
|
12738
13116
|
|
12739
13117
|
// normalize and split by whitespace
|
12740
13118
|
std::vector<std::string> words = preprocess(text);
|
@@ -12749,108 +13127,89 @@ struct llm_tokenizer_wpm {
|
|
12749
13127
|
}
|
12750
13128
|
|
12751
13129
|
// prepend phantom space
|
12752
|
-
std::string word1 = "\xe2\x96\x81" + word;
|
12753
|
-
int n = word1.size();
|
13130
|
+
const std::string word1 = "\xe2\x96\x81" + word;
|
13131
|
+
const int n = word1.size();
|
12754
13132
|
|
12755
|
-
|
12756
|
-
int i = 0;
|
12757
|
-
bool match_any = false;
|
13133
|
+
const size_t current_tokens = output.size();
|
12758
13134
|
|
13135
|
+
// we're at the start of a new word
|
12759
13136
|
// move through character position in word
|
12760
|
-
|
13137
|
+
for (int i = 0; i < n; ++i) {
|
12761
13138
|
// loop through possible match length
|
12762
13139
|
bool match = false;
|
12763
13140
|
for (int j = n; j > i; j--) {
|
12764
|
-
auto it = token_map
|
12765
|
-
if (it != token_map
|
13141
|
+
auto it = token_map.find(word1.substr(i, j - i));
|
13142
|
+
if (it != token_map.end()) {
|
12766
13143
|
output.push_back(it->second);
|
12767
13144
|
match = true;
|
12768
|
-
|
12769
|
-
i = j;
|
13145
|
+
i = j - 1;
|
12770
13146
|
break;
|
12771
13147
|
}
|
12772
13148
|
}
|
12773
13149
|
|
12774
|
-
|
12775
|
-
|
12776
|
-
|
13150
|
+
if (!match) { // discard all
|
13151
|
+
output.resize(current_tokens);
|
13152
|
+
break; // and discard next tokens
|
12777
13153
|
}
|
12778
13154
|
}
|
12779
13155
|
|
12780
13156
|
// we didn't find any matches for this word
|
12781
|
-
if (
|
13157
|
+
if (current_tokens == output.size()) {
|
12782
13158
|
output.push_back(vocab.special_unk_id);
|
12783
13159
|
}
|
12784
13160
|
}
|
12785
13161
|
}
|
12786
13162
|
|
12787
13163
|
std::vector<std::string> preprocess(const std::string & text) {
|
12788
|
-
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
12789
|
-
|
12790
|
-
|
12791
|
-
|
12792
|
-
|
12793
|
-
|
12794
|
-
|
12795
|
-
|
13164
|
+
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
|
13165
|
+
std::vector<std::string> words(1, "");
|
13166
|
+
|
13167
|
+
for (const char32_t cpt : cpts_nfd) {
|
13168
|
+
const auto flags = unicode_cpt_flags(cpt);
|
13169
|
+
|
13170
|
+
if (flags.is_whitespace) {
|
13171
|
+
if (words.back().size()) { // finish previous word if any
|
13172
|
+
words.emplace_back();
|
13173
|
+
}
|
12796
13174
|
continue;
|
12797
13175
|
}
|
12798
|
-
|
12799
|
-
|
12800
|
-
|
12801
|
-
|
12802
|
-
std::string s = unicode_cpt_to_utf8(code);
|
12803
|
-
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
|
12804
|
-
new_str += " ";
|
12805
|
-
new_str += s;
|
12806
|
-
new_str += " ";
|
12807
|
-
} else {
|
12808
|
-
new_str += s;
|
13176
|
+
|
13177
|
+
assert (!flags.is_separator);
|
13178
|
+
if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
|
13179
|
+
continue;
|
12809
13180
|
}
|
12810
|
-
}
|
12811
13181
|
|
12812
|
-
|
12813
|
-
|
12814
|
-
|
12815
|
-
|
12816
|
-
|
12817
|
-
|
12818
|
-
|
12819
|
-
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
12820
|
-
l = r + 1;
|
12821
|
-
r = l;
|
13182
|
+
const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
|
13183
|
+
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
|
13184
|
+
if (words.back().size()) { // finish previous word if any
|
13185
|
+
words.emplace_back();
|
13186
|
+
}
|
13187
|
+
words.back() = s; // single char word
|
13188
|
+
words.emplace_back(); // start a new word
|
12822
13189
|
} else {
|
12823
|
-
|
13190
|
+
words.back() += s; // append char to word
|
12824
13191
|
}
|
12825
13192
|
}
|
12826
|
-
if (r > l) {
|
12827
|
-
words.push_back(new_str.substr(l, (r - l)));
|
12828
|
-
}
|
12829
|
-
return words;
|
12830
|
-
}
|
12831
13193
|
|
12832
|
-
|
12833
|
-
|
12834
|
-
return false;
|
13194
|
+
if (!words.back().size()) {
|
13195
|
+
words.pop_back();
|
12835
13196
|
}
|
12836
|
-
|
12837
|
-
return
|
13197
|
+
|
13198
|
+
return words;
|
12838
13199
|
}
|
12839
13200
|
|
12840
|
-
bool is_chinese_char(uint32_t cpt) {
|
12841
|
-
|
12842
|
-
(cpt >=
|
13201
|
+
static bool is_chinese_char(uint32_t cpt) {
|
13202
|
+
return
|
13203
|
+
(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
|
13204
|
+
(cpt >= 0x03400 && cpt <= 0x04DBF) ||
|
12843
13205
|
(cpt >= 0x20000 && cpt <= 0x2A6DF) ||
|
12844
13206
|
(cpt >= 0x2A700 && cpt <= 0x2B73F) ||
|
12845
13207
|
(cpt >= 0x2B740 && cpt <= 0x2B81F) ||
|
12846
13208
|
(cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
|
12847
|
-
(cpt >=
|
12848
|
-
(cpt >= 0x2F800 && cpt <= 0x2FA1F)
|
12849
|
-
(cpt >= 0x3000 && cpt <= 0x303F) ||
|
12850
|
-
(cpt >= 0xFF00 && cpt <= 0xFFEF)
|
12851
|
-
return true; // NOLINT
|
12852
|
-
}
|
12853
|
-
return false;
|
13209
|
+
(cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
|
13210
|
+
(cpt >= 0x2F800 && cpt <= 0x2FA1F);
|
13211
|
+
//(cpt >= 0x3000 && cpt <= 0x303F) ||
|
13212
|
+
//(cpt >= 0xFF00 && cpt <= 0xFFEF);
|
12854
13213
|
}
|
12855
13214
|
|
12856
13215
|
const llama_vocab & vocab;
|
@@ -12894,9 +13253,8 @@ struct fragment_buffer_variant {
|
|
12894
13253
|
|
12895
13254
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
12896
13255
|
// for each special token
|
12897
|
-
for (const
|
12898
|
-
const auto & special_token =
|
12899
|
-
const auto & special_id = st.second;
|
13256
|
+
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
13257
|
+
const auto & special_token = vocab.id_to_token[special_id].text;
|
12900
13258
|
|
12901
13259
|
// for each text fragment
|
12902
13260
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
@@ -12905,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12905
13263
|
|
12906
13264
|
// if a fragment is text ( not yet processed )
|
12907
13265
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
12908
|
-
auto
|
13266
|
+
auto & raw_text = fragment.raw_text;
|
12909
13267
|
|
12910
13268
|
auto raw_text_base_offset = fragment.offset;
|
12911
13269
|
auto raw_text_base_length = fragment.length;
|
@@ -12915,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12915
13273
|
// find the first occurrence of a given special token in this fragment
|
12916
13274
|
// passing offset argument only limit the "search area" but match coordinates
|
12917
13275
|
// are still relative to the source full raw_text
|
12918
|
-
auto match = raw_text
|
13276
|
+
auto match = raw_text.find(special_token, raw_text_base_offset);
|
12919
13277
|
|
12920
13278
|
// no occurrences found, stop processing this fragment for a given special token
|
12921
13279
|
if (match == std::string::npos) break;
|
@@ -12934,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12934
13292
|
// left
|
12935
13293
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
12936
13294
|
const int64_t left_reminder_length = match - raw_text_base_offset;
|
12937
|
-
buffer.emplace_after(it,
|
13295
|
+
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
12938
13296
|
|
12939
13297
|
#ifdef PRETOKENIZERDEBUG
|
12940
13298
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
@@ -12950,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
12950
13308
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
12951
13309
|
const int64_t right_reminder_offset = match + special_token.length();
|
12952
13310
|
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
12953
|
-
buffer.emplace_after(it,
|
13311
|
+
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
12954
13312
|
|
12955
13313
|
#ifdef PRETOKENIZERDEBUG
|
12956
13314
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
@@ -14054,7 +14412,7 @@ void llama_sample_repetition_penalties(
|
|
14054
14412
|
|
14055
14413
|
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
14056
14414
|
GGML_ASSERT(ctx);
|
14057
|
-
|
14415
|
+
int64_t t_start_sample_us = ggml_time_us();
|
14058
14416
|
|
14059
14417
|
bool allow_eog = false;
|
14060
14418
|
for (const auto & stack : grammar->stacks) {
|
@@ -14066,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|
14066
14424
|
|
14067
14425
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
14068
14426
|
candidates_decoded.reserve(candidates->size);
|
14069
|
-
|
14427
|
+
|
14428
|
+
std::vector<llama_grammar_candidate> candidates_grammar;
|
14070
14429
|
candidates_grammar.reserve(candidates->size);
|
14071
14430
|
|
14072
14431
|
for (size_t i = 0; i < candidates->size; ++i) {
|
14073
|
-
const llama_token id
|
14074
|
-
const std::string piece =
|
14432
|
+
const llama_token id = candidates->data[i].id;
|
14433
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
|
14075
14434
|
|
14076
14435
|
if (llama_token_is_eog(&ctx->model, id)) {
|
14077
14436
|
if (!allow_eog) {
|
@@ -14271,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
|
14271
14630
|
GGML_ASSERT(false);
|
14272
14631
|
}
|
14273
14632
|
|
14274
|
-
const std::string piece =
|
14633
|
+
const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
|
14275
14634
|
|
14276
14635
|
// Note terminating 0 in decoded string
|
14277
14636
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
@@ -16235,6 +16594,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
16235
16594
|
case LLM_ARCH_COMMAND_R:
|
16236
16595
|
case LLM_ARCH_OLMO:
|
16237
16596
|
case LLM_ARCH_ARCTIC:
|
16597
|
+
case LLM_ARCH_DEEPSEEK2:
|
16238
16598
|
return LLAMA_ROPE_TYPE_NORM;
|
16239
16599
|
|
16240
16600
|
// the pairs of head values are offset by n_rot/2
|
@@ -17861,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
|
17861
18221
|
);
|
17862
18222
|
}
|
17863
18223
|
|
18224
|
+
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
18225
|
+
return llama_is_control_token(model->vocab, token);
|
18226
|
+
}
|
18227
|
+
|
17864
18228
|
llama_token llama_token_bos(const struct llama_model * model) {
|
17865
18229
|
return model->vocab.special_bos_id;
|
17866
18230
|
}
|
@@ -17932,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
|
|
17932
18296
|
|
17933
18297
|
const auto cpts = unicode_cpts_from_utf8(text);
|
17934
18298
|
for (const auto cpt : cpts) {
|
17935
|
-
|
18299
|
+
const auto utf8 = unicode_cpt_to_utf8(cpt);
|
18300
|
+
try {
|
18301
|
+
decoded_text += unicode_utf8_to_byte(utf8);
|
18302
|
+
} catch (const std::out_of_range & e) {
|
18303
|
+
decoded_text += "[UNK_BYTE_0x";
|
18304
|
+
for (const auto c : utf8) {
|
18305
|
+
decoded_text += format("%02x", (uint8_t) c);
|
18306
|
+
}
|
18307
|
+
decoded_text += text + "]";
|
18308
|
+
}
|
17936
18309
|
}
|
17937
18310
|
|
17938
18311
|
return decoded_text;
|
@@ -17940,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
|
|
17940
18313
|
|
17941
18314
|
// does not write null-terminator to buf
|
17942
18315
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
|
18316
|
+
// if we have a cache - use it
|
18317
|
+
{
|
18318
|
+
const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
|
18319
|
+
|
18320
|
+
if (!cache.empty()) {
|
18321
|
+
const auto & res = cache.at(token);
|
18322
|
+
if (length < (int) res.size()) {
|
18323
|
+
return -(int) res.size();
|
18324
|
+
}
|
18325
|
+
memcpy(buf, res.c_str(), res.size());
|
18326
|
+
return res.size();
|
18327
|
+
}
|
18328
|
+
}
|
18329
|
+
|
17943
18330
|
if (0 <= token && token < llama_n_vocab(model)) {
|
17944
18331
|
switch (llama_vocab_get_type(model->vocab)) {
|
17945
|
-
|
17946
|
-
|
17947
|
-
|
17948
|
-
|
17949
|
-
|
17950
|
-
|
17951
|
-
|
17952
|
-
|
17953
|
-
|
17954
|
-
|
17955
|
-
|
17956
|
-
|
17957
|
-
|
17958
|
-
|
17959
|
-
|
17960
|
-
|
17961
|
-
|
17962
|
-
|
17963
|
-
|
17964
|
-
|
17965
|
-
|
17966
|
-
|
17967
|
-
|
17968
|
-
|
17969
|
-
|
17970
|
-
|
17971
|
-
|
17972
|
-
|
17973
|
-
|
17974
|
-
|
18332
|
+
case LLAMA_VOCAB_TYPE_WPM:
|
18333
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
18334
|
+
// NOTE: we accept all unsupported token types,
|
18335
|
+
// suppressing them like CONTROL tokens.
|
18336
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
18337
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18338
|
+
llama_unescape_whitespace(result);
|
18339
|
+
if (length < (int) result.length()) {
|
18340
|
+
return -(int) result.length();
|
18341
|
+
}
|
18342
|
+
memcpy(buf, result.c_str(), result.length());
|
18343
|
+
return result.length();
|
18344
|
+
} else if (
|
18345
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
18346
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
18347
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18348
|
+
if (length < (int) result.length()) {
|
18349
|
+
return -(int) result.length();
|
18350
|
+
}
|
18351
|
+
memcpy(buf, result.c_str(), result.length());
|
18352
|
+
return result.length();
|
18353
|
+
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
18354
|
+
if (length < 3) {
|
18355
|
+
return -3;
|
18356
|
+
}
|
18357
|
+
memcpy(buf, "\xe2\x96\x85", 3);
|
18358
|
+
return 3;
|
18359
|
+
} else if (llama_is_byte_token(model->vocab, token)) {
|
18360
|
+
if (length < 1) {
|
18361
|
+
return -1;
|
18362
|
+
}
|
18363
|
+
buf[0] = llama_token_to_byte(model->vocab, token);
|
18364
|
+
return 1;
|
17975
18365
|
}
|
17976
|
-
|
17977
|
-
return 1;
|
18366
|
+
break;
|
17978
18367
|
}
|
17979
|
-
|
17980
|
-
|
17981
|
-
|
17982
|
-
|
17983
|
-
|
17984
|
-
|
17985
|
-
|
17986
|
-
|
17987
|
-
|
17988
|
-
|
17989
|
-
|
17990
|
-
|
17991
|
-
|
17992
|
-
|
17993
|
-
|
17994
|
-
(
|
17995
|
-
|
17996
|
-
|
17997
|
-
|
18368
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
18369
|
+
// NOTE: we accept all unsupported token types,
|
18370
|
+
// suppressing them like CONTROL tokens.
|
18371
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
18372
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18373
|
+
result = llama_decode_text(result);
|
18374
|
+
if (length < (int) result.length()) {
|
18375
|
+
return -(int) result.length();
|
18376
|
+
}
|
18377
|
+
memcpy(buf, result.c_str(), result.length());
|
18378
|
+
return result.length();
|
18379
|
+
} else if (
|
18380
|
+
(llama_is_user_defined_token(model->vocab, token)) ||
|
18381
|
+
(llama_is_control_token (model->vocab, token) && special)) {
|
18382
|
+
std::string result = model->vocab.id_to_token[token].text;
|
18383
|
+
if (length < (int) result.length()) {
|
18384
|
+
return -(int) result.length();
|
18385
|
+
}
|
18386
|
+
memcpy(buf, result.c_str(), result.length());
|
18387
|
+
return result.length();
|
17998
18388
|
}
|
17999
|
-
|
18000
|
-
return result.length();
|
18389
|
+
break;
|
18001
18390
|
}
|
18002
|
-
|
18003
|
-
|
18004
|
-
default:
|
18005
|
-
GGML_ASSERT(false);
|
18391
|
+
default:
|
18392
|
+
GGML_ASSERT(false);
|
18006
18393
|
}
|
18007
18394
|
}
|
18008
18395
|
return 0;
|
@@ -18337,6 +18724,7 @@ const char * llama_print_system_info(void) {
|
|
18337
18724
|
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
18338
18725
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
18339
18726
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
18727
|
+
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
18340
18728
|
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
18341
18729
|
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
18342
18730
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|