llama_cpp 0.15.3 → 0.15.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.cpp CHANGED
@@ -103,7 +103,7 @@
 #endif

 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 160

 //
 // logging
@@ -222,6 +222,7 @@ enum llm_arch {
 LLM_ARCH_DBRX,
 LLM_ARCH_OLMO,
 LLM_ARCH_ARCTIC,
+LLM_ARCH_DEEPSEEK2,
 LLM_ARCH_UNKNOWN,
 };

@@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_DBRX, "dbrx" },
 { LLM_ARCH_OLMO, "olmo" },
 { LLM_ARCH_ARCTIC, "arctic" },
+{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -279,11 +281,15 @@ enum llm_kv {
 LLM_KV_CONTEXT_LENGTH,
 LLM_KV_EMBEDDING_LENGTH,
 LLM_KV_BLOCK_COUNT,
+LLM_KV_LEADING_DENSE_BLOCK_COUNT,
 LLM_KV_FEED_FORWARD_LENGTH,
+LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
 LLM_KV_USE_PARALLEL_RESIDUAL,
 LLM_KV_TENSOR_DATA_LAYOUT,
 LLM_KV_EXPERT_COUNT,
 LLM_KV_EXPERT_USED_COUNT,
+LLM_KV_EXPERT_SHARED_COUNT,
+LLM_KV_EXPERT_WEIGHTS_SCALE,
 LLM_KV_POOLING_TYPE,
 LLM_KV_LOGIT_SCALE,

@@ -296,6 +302,8 @@ enum llm_kv {
 LLM_KV_ATTENTION_LAYERNORM_EPS,
 LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 LLM_KV_ATTENTION_CAUSAL,
+LLM_KV_ATTENTION_Q_LORA_RANK,
+LLM_KV_ATTENTION_KV_LORA_RANK,

 LLM_KV_ROPE_DIMENSION_COUNT,
 LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +313,7 @@ enum llm_kv {
 LLM_KV_ROPE_SCALING_ATTN_FACTOR,
 LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
 LLM_KV_ROPE_SCALING_FINETUNED,
+LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

 LLM_KV_SPLIT_NO,
 LLM_KV_SPLIT_COUNT,
@@ -353,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
 { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-{ LLM_KV_VOCAB_SIZE, "%s.vocab_size"
-{ LLM_KV_CONTEXT_LENGTH, "%s.context_length"
-{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length"
-{ LLM_KV_BLOCK_COUNT, "%s.block_count"
-{
-{
-{
-{
-{
-{
-{
+{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
+{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

 { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
 { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +387,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
 { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },

 { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
 { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -383,6 +398,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
 { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
 { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },

 { LLM_KV_SPLIT_NO, "split.no" },
 { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -474,6 +490,12 @@ enum llm_tensor {
 LLM_TENSOR_SSM_A,
 LLM_TENSOR_SSM_D,
 LLM_TENSOR_SSM_OUT,
+LLM_TENSOR_ATTN_Q_A,
+LLM_TENSOR_ATTN_Q_B,
+LLM_TENSOR_ATTN_KV_A_MQA,
+LLM_TENSOR_ATTN_KV_B,
+LLM_TENSOR_ATTN_Q_A_NORM,
+LLM_TENSOR_ATTN_KV_A_NORM,
 };

 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1079,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
 },
 },
+{
+LLM_ARCH_DEEPSEEK2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -1651,12 +1702,13 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-
+// NOTE: avoid ever using this except for building the token_to_piece caches
+static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
 std::vector<char> result(8, 0);
-const int n_tokens = llama_token_to_piece(
+const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-int check = llama_token_to_piece(
+int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
 GGML_ASSERT(check == -n_tokens);
 }
 else {
@@ -1741,6 +1793,7 @@ enum e_model {
 MODEL_13B,
 MODEL_14B,
 MODEL_15B,
+MODEL_16B,
 MODEL_20B,
 MODEL_30B,
 MODEL_34B,
@@ -1748,6 +1801,7 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_236B,
 MODEL_314B,
 MODEL_SMALL,
 MODEL_MEDIUM,
@@ -1783,6 +1837,13 @@ struct llama_hparams {
 uint32_t n_expert_used = 0;
 uint32_t n_vocab_type = 0; // for BERT-style token types

+uint32_t n_layer_dense_lead = 0;
+uint32_t n_lora_q = 0;
+uint32_t n_lora_kv = 0;
+uint32_t n_ff_exp = 0;
+uint32_t n_expert_shared = 0;
+float expert_weights_scale = 0.0;
+
 float f_norm_eps;
 float f_norm_rms_eps;

@@ -1790,6 +1851,7 @@ struct llama_hparams {
 float rope_freq_base_train;
 float rope_freq_scale_train;
 uint32_t n_yarn_orig_ctx;
+float rope_yarn_log_mul;

 // for State Space Models
 uint32_t ssm_d_conv = 0;
@@ -1823,6 +1885,12 @@ struct llama_hparams {
 if (this->n_expert != other.n_expert) return true;
 if (this->n_expert_used != other.n_expert_used) return true;

+if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+if (this->n_lora_q != other.n_lora_q) return true;
+if (this->n_lora_kv != other.n_lora_kv) return true;
+if (this->n_ff_exp != other.n_ff_exp) return true;
+if (this->n_expert_shared != other.n_expert_shared) return true;
+
 if (this->rope_finetuned != other.rope_finetuned) return true;
 if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1838,6 +1906,8 @@ struct llama_hparams {
 if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
 if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;

 return false;
 }
@@ -1913,6 +1983,8 @@ struct llama_layer {
 struct ggml_tensor * attn_k_norm_b;
 struct ggml_tensor * attn_out_norm;
 struct ggml_tensor * attn_out_norm_b;
+struct ggml_tensor * attn_q_a_norm;
+struct ggml_tensor * attn_kv_a_norm;

 // attention
 struct ggml_tensor * wq;
@@ -1920,6 +1992,10 @@ struct llama_layer {
 struct ggml_tensor * wv;
 struct ggml_tensor * wo;
 struct ggml_tensor * wqkv;
+struct ggml_tensor * wq_a;
+struct ggml_tensor * wq_b;
+struct ggml_tensor * wkv_a_mqa;
+struct ggml_tensor * wkv_b;

 // attention bias
 struct ggml_tensor * bq;
@@ -1953,8 +2029,9 @@ struct llama_layer {
 struct ggml_tensor * ffn_up_shexp;

 // ff bias
-struct ggml_tensor *
-struct ggml_tensor *
+struct ggml_tensor * ffn_gate_b = nullptr;
+struct ggml_tensor * ffn_down_b = nullptr; // b2
+struct ggml_tensor * ffn_up_b = nullptr; // b3
 struct ggml_tensor * ffn_act;

 // mamba proj
@@ -2086,7 +2163,9 @@ struct llama_vocab {
 std::unordered_map<token, id> token_to_id;
 std::vector<token_data> id_to_token;

-std::
+std::vector<id> cache_special_tokens;
+std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
+std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);

 std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -3832,6 +3911,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_13B: return "13B";
 case MODEL_14B: return "14B";
 case MODEL_15B: return "15B";
+case MODEL_16B: return "16B";
 case MODEL_20B: return "20B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
@@ -3839,6 +3919,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
+case MODEL_236B: return "236B";
 case MODEL_314B: return "314B";
 case MODEL_SMALL: return "0.1B";
 case MODEL_MEDIUM: return "0.4B";
@@ -3981,7 +4062,9 @@ static void llm_load_hparams(
 switch (hparams.n_layer) {
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
-
+// granite uses a vocab with len 49152
+case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+case 36: model.type = e_model::MODEL_8B; break; // granite
 case 40: model.type = e_model::MODEL_13B; break;
 case 48: model.type = e_model::MODEL_34B; break;
 case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4334,8 @@ static void llm_load_hparams(
 case 30: model.type = e_model::MODEL_3B; break;
 case 32: model.type = e_model::MODEL_7B; break;
 case 40: model.type = e_model::MODEL_15B; break;
+case 52: model.type = e_model::MODEL_20B; break; // granite
+case 88: model.type = e_model::MODEL_34B; break; // granite
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
@@ -4384,6 +4469,26 @@ static void llm_load_hparams(
 model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_DEEPSEEK2:
+{
+bool is_lite = (hparams.n_layer == 27);
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+if (!is_lite) {
+ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+}
+ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+switch (hparams.n_layer) {
+case 27: model.type = e_model::MODEL_16B; break;
+case 60: model.type = e_model::MODEL_236B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }

@@ -4490,15 +4595,14 @@ static void llm_load_vocab(
 vocab.special_cls_id = 101;
 vocab.special_mask_id = 103;
 vocab.add_space_prefix = false;
-} else {
-
-
-
-
-
-vocab.type = LLAMA_VOCAB_TYPE_SPM;
-return;
+} else if (tokenizer_model == "gpt2") {
+vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+if (add_space_prefix_keyidx != -1) {
+vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
 }
+
 // read bpe merges and populate bpe ranks
 const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
 if (merges_keyidx == -1) {
@@ -4532,6 +4636,8 @@ static void llm_load_vocab(
 vocab.special_pad_id = -1;
 vocab.special_cls_id = -1;
 vocab.special_mask_id = -1;
+} else {
+throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
 }

 // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4699,9 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "dbrx") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+} else if (
+tokenizer_pre == "smaug-bpe") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }
@@ -4721,97 +4830,40 @@ static void llm_load_vocab(

 // build special tokens cache
 {
-
-// and will always be correctly labeled in 'added_tokens.json' etc.
-// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
-// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
-// are special tokens.
-// From testing, this appears to correlate 1:1 with special tokens.
-//
-
-// Counting special tokens and verifying in only one direction
-// is sufficient to detect difference in those two sets.
-//
-uint32_t special_tokens_count_by_type = 0;
-uint32_t special_tokens_count_from_verification = 0;
-
-bool special_tokens_definition_mismatch = false;
-
-for (const auto & t : vocab.token_to_id) {
-const auto & token = t.first;
-const auto & id = t.second;
-
-// Count all non-normal tokens in the vocab while iterating
+for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
 if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-
+vocab.cache_special_tokens.push_back(id);
 }
+}

-
-
-
-
-
-// and check if both halves can be matched to a valid token
-for (unsigned i = 1; i < token.length();) {
-const auto left = token.substr(0, i);
-const auto right = token.substr(i);
-
-// check if we didnt partition in the middle of a utf sequence
-auto utf = utf8_len(left.at(left.length() - 1));
-
-if (utf == 1) {
-if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
-vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
-is_tokenizable = true;
-break;
-}
-i++;
-} else {
-// skip over the rest of multibyte utf sequence
-i += utf - 1;
-}
-}
+std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+[&] (const llama_vocab::id a, const llama_vocab::id b) {
+return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+}
+);

-
-
-// it's faster to re-filter them here, since there are way less candidates now
+LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+}

-
-
-
-utf8_str_len++;
-i += utf8_len(token.at(i));
-}
+// build token to piece caches
+{
+size_t size_cache = 0;

-
-
-// At this point what we have left are special tokens only
-vocab.special_tokens_cache[token] = id;
+std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
+std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);

-
-
+for (uint32_t id = 0; id < n_vocab; ++id) {
+cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
+cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);

-
-
-special_tokens_definition_mismatch = true;
-}
-}
-}
-}
+size_cache += cache_token_to_piece[id].size();
+size_cache += cache_token_to_piece_special[id].size();
 }

-
-
-
-
-special_tokens_count_by_type, vocab.id_to_token.size()
-);
-} else {
-LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
-__func__,
-special_tokens_count_from_verification, vocab.id_to_token.size()
-);
-}
+std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
+std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+
+LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
 }
 }

@@ -4892,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
 if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
 if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+if (model.arch == LLM_ARCH_DEEPSEEK2) {
+LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+}
 }

 // Returns false if cancelled by progress_callback
@@ -5048,8 +5110,6 @@ static bool llm_load_tensors(
 throw std::runtime_error("model has expert layers but no expert layers are used");
 }

-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
 ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
 ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
 ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5103,6 +5163,11 @@ static bool llm_load_tensors(
 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+// optional MLP bias
+layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 } else {
 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

@@ -6210,6 +6275,70 @@ static bool llm_load_tensors(
 layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
 }
 } break;
+case LLM_ARCH_DEEPSEEK2:
+{
+bool is_lite = (hparams.n_layer == 27);
+
+const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+const uint32_t q_lora_rank = hparams.n_lora_q;
+const uint32_t kv_lora_rank = hparams.n_lora_kv;
+const uint32_t n_ff_exp = hparams.n_ff_exp;
+
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+if (!is_lite) {
+layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+}
+layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+if (!is_lite) {
+layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
+} else {
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+}
+layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
+layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+if ((uint32_t) i < hparams.n_layer_dense_lead) {
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+} else {
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+GGML_ASSERT(hparams.n_expert > 0);
+GGML_ASSERT(hparams.n_expert_used > 0);
+
+// MoE branch
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+// Shared expert branch
+layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
+layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+}
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -6664,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
 int64_t n_expert_used,
 llm_ffn_op_type type_op,
 bool norm_w,
+bool scale_w,
+float w_scale,
 const llm_build_cb & cb,
 int il) {
 int64_t n_embd = cur->ne[0];
@@ -6695,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(

 weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
 }
+if (scale_w) {
+weights = ggml_scale(ctx, weights, w_scale);
+cb(weights, "ffn_moe_weights_scaled", il);
+}

 cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
 ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -7305,9 +7440,9 @@ struct llm_build_context {
 cb(cur, "ffn_norm", il);

 cur = llm_build_ffn(ctx0, cur,
-model.layers[il].ffn_up,
-model.layers[il].ffn_gate,
-model.layers[il].ffn_down,
+model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 NULL,
 LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
 cb(cur, "ffn_out", il);
@@ -7325,6 +7460,7 @@ struct llm_build_context {
 model.layers[il].ffn_down_exps,
 n_expert, n_expert_used,
 LLM_FFN_SILU, true,
+false, 0.0,
 cb, il);
 cb(cur, "ffn_moe_out", il);
 }
@@ -7806,6 +7942,7 @@ struct llm_build_context {
 model.layers[il].ffn_down_exps,
 n_expert, n_expert_used,
 LLM_FFN_GELU, true,
+false, 0.0,
 cb, il);
 cb(cur, "ffn_moe_out", il);

@@ -7949,6 +8086,7 @@ struct llm_build_context {
 model.layers[il].ffn_down_exps,
 n_expert, n_expert_used,
 LLM_FFN_SILU, true,
+false, 0.0,
 cb, il);
 cb(cur, "ffn_moe_out", il);

@@ -9087,6 +9225,7 @@ struct llm_build_context {
 model.layers[il].ffn_down_exps,
 n_expert, n_expert_used,
 LLM_FFN_SILU, false,
+false, 0.0,
 cb, il);
 cb(cur, "ffn_moe_out", il);

@@ -10974,6 +11113,7 @@ struct llm_build_context {
 model.layers[il].ffn_down_exps,
 n_expert, n_expert_used,
 LLM_FFN_SILU, true,
+false, 0.0,
 cb, il);
 cb(cur, "ffn_moe_out", il);

@@ -11005,6 +11145,239 @@ struct llm_build_context {

 return gf;
 }
+
+struct ggml_cgraph * build_deepseek2() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+// mutable variable, needed during the last layer of the computation to skip unused tokens
+int32_t n_tokens = this->n_tokens;
+
+bool is_lite = (hparams.n_layer == 27);
+
+// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
+const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+// {n_embd, n_tokens}
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = build_inp_pos();
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+
+// self_attention
+{
+struct ggml_tensor * q = NULL;
+if (!is_lite) {
+// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+cb(q, "q", il);
+
+q = llm_build_norm(ctx0, q, hparams,
+model.layers[il].attn_q_a_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(q, "q", il);
+
+// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+cb(q, "q", il);
+} else {
+q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(q, "q", il);
+}
+
+// split into {n_head * n_embd_head_qk_nope, n_tokens}
+struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ggml_row_size(q->type, hparams.n_embd_head_k),
+ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+0);
+cb(q_nope, "q_nope", il);
+
+// and {n_head * n_embd_head_qk_rope, n_tokens}
+struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ggml_row_size(q->type, hparams.n_embd_head_k),
+ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ggml_row_size(q->type, n_embd_head_qk_nope));
+cb(q_pe, "q_pe", il);
+
+// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+// split into {kv_lora_rank, n_tokens}
+struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+kv_pe_compresseed->nb[1],
+0);
+cb(kv_compressed, "kv_compressed", il);
+
+// and {n_embd_head_qk_rope, n_tokens}
+struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+kv_pe_compresseed->nb[1],
+kv_pe_compresseed->nb[1],
+ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+cb(k_pe, "k_pe", il);
+
+kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+model.layers[il].attn_kv_a_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(kv_compressed, "kv_compressed", il);
+
+// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+cb(kv, "kv", il);
+
+// split into {n_head * n_embd_head_qk_nope, n_tokens}
+struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+0);
+cb(k_nope, "k_nope", il);
+
+// and {n_head * n_embd_head_v, n_tokens}
+struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+cb(v_states, "v_states", il);
+
+v_states = ggml_cont(ctx0, v_states);
+cb(v_states, "v_states", il);
+
+v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+0);
+cb(v_states, "v_states", il);
+
+q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+q_pe = ggml_rope_ext(
+ctx0, q_pe, inp_pos, nullptr,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor_scaled, beta_fast, beta_slow
+);
+cb(q_pe, "q_pe", il);
+
+// shared RoPE key
+k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+k_pe = ggml_rope_ext(
+ctx0, k_pe, inp_pos, nullptr,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor_scaled, beta_fast, beta_slow
+);
+cb(k_pe, "k_pe", il);
+
+struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+cb(q_states, "q_states", il);
+
+struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+cb(k_states, "k_states", il);
+
+cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+model.layers[il].wo, NULL,
+k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+n_tokens = n_outputs;
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+if ((uint32_t) il < hparams.n_layer_dense_lead) {
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up, NULL,
+model.layers[il].ffn_gate, NULL,
+model.layers[il].ffn_down, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+} else {
+// MoE branch
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+ggml_tensor * moe_out =
+llm_build_moe_ffn(ctx0, cur,
+model.layers[il].ffn_gate_inp,
+model.layers[il].ffn_up_exps,
+model.layers[il].ffn_gate_exps,
+model.layers[il].ffn_down_exps,
+n_expert, n_expert_used,
+LLM_FFN_SILU, false,
+true, hparams.expert_weights_scale,
+cb, il);
+cb(moe_out, "ffn_moe_out", il);
+
+// FFN shared expert
+{
+ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up_shexp, NULL,
+model.layers[il].ffn_gate_shexp, NULL,
+model.layers[il].ffn_down_shexp, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(ffn_shexp, "ffn_shexp", il);
+
+cur = ggml_add(ctx0, moe_out, ffn_shexp);
+cb(cur, "ffn_out", il);
+}
+}
+
+cur = ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, NULL,
+LLM_NORM_RMS, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -11223,6 +11596,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_arctic();
 } break;
+case LLM_ARCH_DEEPSEEK2:
+{
+result = llm.build_deepseek2();
+} break;
 default:
 GGML_ASSERT(false);
 }
@@ -12512,6 +12889,7 @@ struct llm_tokenizer_bpe {
 });
 break;
 case LLAMA_VOCAB_PRE_TYPE_DBRX:
+case LLAMA_VOCAB_PRE_TYPE_SMAUG:
 word_collection = unicode_regex_split(text, {
 // same as llama3
 "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13112,7 @@ struct llm_tokenizer_wpm {
 llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

 void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-auto
+const auto & token_map = vocab.token_to_id;

 // normalize and split by whitespace
 std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13127,89 @@ struct llm_tokenizer_wpm {
 }

 // prepend phantom space
-std::string word1 = "\xe2\x96\x81" + word;
-int n = word1.size();
+const std::string word1 = "\xe2\x96\x81" + word;
+const int n = word1.size();

-
-int i = 0;
-bool match_any = false;
+const size_t current_tokens = output.size();

+// we're at the start of a new word
 // move through character position in word
-
+for (int i = 0; i < n; ++i) {
 // loop through possible match length
 bool match = false;
 for (int j = n; j > i; j--) {
-auto it = token_map
-if (it != token_map
+auto it = token_map.find(word1.substr(i, j - i));
+if (it != token_map.end()) {
 output.push_back(it->second);
 match = true;
-
-i = j;
+i = j - 1;
 break;
 }
 }

-
-
-
+if (!match) { // discard all
+output.resize(current_tokens);
+break; // and discard next tokens
 }
 }

 // we didn't find any matches for this word
-if (
+if (current_tokens == output.size()) {
 output.push_back(vocab.special_unk_id);
 }
 }
 }

 std::vector<std::string> preprocess(const std::string & text) {
-std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-
-
-
-
-
-
-
+const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+std::vector<std::string> words(1, "");
+
+for (const char32_t cpt : cpts_nfd) {
+const auto flags = unicode_cpt_flags(cpt);
+
+if (flags.is_whitespace) {
+if (words.back().size()) { // finish previous word if any
+words.emplace_back();
+}
 continue;
 }
-
-
-
-
-std::string s = unicode_cpt_to_utf8(code);
-if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
-new_str += " ";
-new_str += s;
-new_str += " ";
-} else {
-new_str += s;
+
+assert (!flags.is_separator);
+if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+continue;
 }
-}

-
-
-
-
-
-
-
-if (r > l) words.push_back(new_str.substr(l, (r - l)));
-l = r + 1;
-r = l;
+const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+if (words.back().size()) { // finish previous word if any
+words.emplace_back();
+}
+words.back() = s; // single char word
+words.emplace_back(); // start a new word
 } else {
-
+words.back() += s; // append char to word
 }
 }
-if (r > l) {
-words.push_back(new_str.substr(l, (r - l)));
-}
-return words;
-}

-
-
-return false;
+if (!words.back().size()) {
+words.pop_back();
 }
-
-return
+
+return words;
 }

-bool is_chinese_char(uint32_t cpt) {
-
-(cpt >=
+static bool is_chinese_char(uint32_t cpt) {
+return
+(cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+(cpt >= 0x03400 && cpt <= 0x04DBF) ||
 (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
 (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
 (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
 (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
-(cpt >=
-(cpt >= 0x2F800 && cpt <= 0x2FA1F)
-(cpt >= 0x3000 && cpt <= 0x303F) ||
-(cpt >= 0xFF00 && cpt <= 0xFFEF)
-return true; // NOLINT
-}
-return false;
+(cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+(cpt >= 0x2F800 && cpt <= 0x2FA1F);
+//(cpt >= 0x3000 && cpt <= 0x303F) ||
+//(cpt >= 0xFF00 && cpt <= 0xFFEF);
 }

 const llama_vocab & vocab;
@@ -12894,9 +13253,8 @@ struct fragment_buffer_variant {

 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
 // for each special token
-for (const
-const auto & special_token =
-const auto & special_id = st.second;
+for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
+const auto & special_token = vocab.id_to_token[special_id].text;

 // for each text fragment
 std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<

 // if a fragment is text ( not yet processed )
 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-auto
+auto & raw_text = fragment.raw_text;

 auto raw_text_base_offset = fragment.offset;
 auto raw_text_base_length = fragment.length;
@@ -12915,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 // find the first occurrence of a given special token in this fragment
 // passing offset argument only limit the "search area" but match coordinates
 // are still relative to the source full raw_text
-auto match = raw_text
+auto match = raw_text.find(special_token, raw_text_base_offset);

 // no occurrences found, stop processing this fragment for a given special token
 if (match == std::string::npos) break;
@@ -12934,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 // left
 const int64_t left_reminder_offset = raw_text_base_offset + 0;
 const int64_t left_reminder_length = match - raw_text_base_offset;
-buffer.emplace_after(it,
+buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);

 #ifdef PRETOKENIZERDEBUG
 LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@@ -12950,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
 const int64_t right_reminder_offset = match + special_token.length();
 const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-buffer.emplace_after(it,
+buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);

 #ifdef PRETOKENIZERDEBUG
 LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@@ -14054,7 +14412,7 @@ void llama_sample_repetition_penalties(

 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
 GGML_ASSERT(ctx);
-
+int64_t t_start_sample_us = ggml_time_us();

 bool allow_eog = false;
 for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

 std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
 candidates_decoded.reserve(candidates->size);
-
+
+std::vector<llama_grammar_candidate> candidates_grammar;
 candidates_grammar.reserve(candidates->size);

 for (size_t i = 0; i < candidates->size; ++i) {
-const llama_token id
-const std::string piece =
+const llama_token id = candidates->data[i].id;
+const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);

 if (llama_token_is_eog(&ctx->model, id)) {
 if (!allow_eog) {
@@ -14271,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 GGML_ASSERT(false);
 }

-const std::string piece =
+const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);

 // Note terminating 0 in decoded string
 const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -16235,6 +16594,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_COMMAND_R:
 case LLM_ARCH_OLMO:
 case LLM_ARCH_ARCTIC:
+case LLM_ARCH_DEEPSEEK2:
 return LLAMA_ROPE_TYPE_NORM;

 // the pairs of head values are offset by n_rot/2
@@ -17861,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
 );
 }

+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
 return model->vocab.special_bos_id;
 }
@@ -17932,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {

 const auto cpts = unicode_cpts_from_utf8(text);
 for (const auto cpt : cpts) {
-
+const auto utf8 = unicode_cpt_to_utf8(cpt);
+try {
+decoded_text += unicode_utf8_to_byte(utf8);
+} catch (const std::out_of_range & e) {
+decoded_text += "[UNK_BYTE_0x";
+for (const auto c : utf8) {
+decoded_text += format("%02x", (uint8_t) c);
+}
+decoded_text += text + "]";
+}
 }

 return decoded_text;
@@ -17940,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {

 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+// if we have a cache - use it
+{
+const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+
+if (!cache.empty()) {
+const auto & res = cache.at(token);
+if (length < (int) res.size()) {
+return -(int) res.size();
+}
+memcpy(buf, res.c_str(), res.size());
+return res.size();
+}
+}
+
 if (0 <= token && token < llama_n_vocab(model)) {
 switch (llama_vocab_get_type(model->vocab)) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+case LLAMA_VOCAB_TYPE_WPM:
+case LLAMA_VOCAB_TYPE_SPM: {
+// NOTE: we accept all unsupported token types,
+// suppressing them like CONTROL tokens.
+if (llama_is_normal_token(model->vocab, token)) {
+std::string result = model->vocab.id_to_token[token].text;
+llama_unescape_whitespace(result);
+if (length < (int) result.length()) {
+return -(int) result.length();
+}
+memcpy(buf, result.c_str(), result.length());
+return result.length();
+} else if (
+(llama_is_user_defined_token(model->vocab, token)) ||
+(llama_is_control_token (model->vocab, token) && special)) {
+std::string result = model->vocab.id_to_token[token].text;
+if (length < (int) result.length()) {
+return -(int) result.length();
+}
+memcpy(buf, result.c_str(), result.length());
+return result.length();
+} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+if (length < 3) {
+return -3;
+}
+memcpy(buf, "\xe2\x96\x85", 3);
+return 3;
+} else if (llama_is_byte_token(model->vocab, token)) {
+if (length < 1) {
+return -1;
+}
+buf[0] = llama_token_to_byte(model->vocab, token);
+return 1;
 }
-
-return 1;
+break;
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-(
-
-
-
+case LLAMA_VOCAB_TYPE_BPE: {
+// NOTE: we accept all unsupported token types,
+// suppressing them like CONTROL tokens.
+if (llama_is_normal_token(model->vocab, token)) {
+std::string result = model->vocab.id_to_token[token].text;
+result = llama_decode_text(result);
+if (length < (int) result.length()) {
+return -(int) result.length();
+}
+memcpy(buf, result.c_str(), result.length());
+return result.length();
+} else if (
+(llama_is_user_defined_token(model->vocab, token)) ||
+(llama_is_control_token (model->vocab, token) && special)) {
+std::string result = model->vocab.id_to_token[token].text;
+if (length < (int) result.length()) {
+return -(int) result.length();
+}
+memcpy(buf, result.c_str(), result.length());
+return result.length();
 }
-
-return result.length();
+break;
 }
-
-
-default:
-GGML_ASSERT(false);
+default:
+GGML_ASSERT(false);
 }
 }
 return 0;
@@ -18337,6 +18724,7 @@ const char * llama_print_system_info(void) {
 s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
 s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
 s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
 s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
 s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
 s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";