llama_cpp 0.15.3 → 0.15.4

@@ -103,7 +103,7 @@
  #endif
 
  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 128
+ #define LLAMA_MAX_EXPERTS 160
 
  //
  // logging
@@ -222,6 +222,7 @@ enum llm_arch {
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
  LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -279,11 +281,15 @@ enum llm_kv {
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
  LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,
 
@@ -296,6 +302,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
  LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,
 
  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +313,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
 
  LLM_KV_SPLIT_NO,
  LLM_KV_SPLIT_COUNT,
@@ -353,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +387,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
 
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -383,6 +398,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
 
  { LLM_KV_SPLIT_NO, "split.no" },
  { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -474,6 +490,12 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
  };
 
  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1079,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_DEEPSEEK2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1651,12 +1702,13 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ // NOTE: avoid ever using this except for building the token_to_piece caches
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
@@ -1741,6 +1793,7 @@ enum e_model {
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
+ MODEL_16B,
  MODEL_20B,
  MODEL_30B,
  MODEL_34B,
@@ -1748,6 +1801,7 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_236B,
  MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
@@ -1783,6 +1837,13 @@ struct llama_hparams {
  uint32_t n_expert_used = 0;
  uint32_t n_vocab_type = 0; // for BERT-style token types
 
+ uint32_t n_layer_dense_lead = 0;
+ uint32_t n_lora_q = 0;
+ uint32_t n_lora_kv = 0;
+ uint32_t n_ff_exp = 0;
+ uint32_t n_expert_shared = 0;
+ float expert_weights_scale = 0.0;
+
  float f_norm_eps;
  float f_norm_rms_eps;
 
@@ -1790,6 +1851,7 @@ struct llama_hparams {
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
+ float rope_yarn_log_mul;
 
  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -1823,6 +1885,12 @@ struct llama_hparams {
  if (this->n_expert != other.n_expert) return true;
  if (this->n_expert_used != other.n_expert_used) return true;
 
+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+ if (this->n_lora_q != other.n_lora_q) return true;
+ if (this->n_lora_kv != other.n_lora_kv) return true;
+ if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_expert_shared != other.n_expert_shared) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
@@ -1838,6 +1906,8 @@ struct llama_hparams {
  if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
 
  return false;
  }
@@ -1913,6 +1983,8 @@ struct llama_layer {
  struct ggml_tensor * attn_k_norm_b;
  struct ggml_tensor * attn_out_norm;
  struct ggml_tensor * attn_out_norm_b;
+ struct ggml_tensor * attn_q_a_norm;
+ struct ggml_tensor * attn_kv_a_norm;
 
  // attention
  struct ggml_tensor * wq;
@@ -1920,6 +1992,10 @@ struct llama_layer {
  struct ggml_tensor * wv;
  struct ggml_tensor * wo;
  struct ggml_tensor * wqkv;
+ struct ggml_tensor * wq_a;
+ struct ggml_tensor * wq_b;
+ struct ggml_tensor * wkv_a_mqa;
+ struct ggml_tensor * wkv_b;
 
  // attention bias
  struct ggml_tensor * bq;
@@ -1953,8 +2029,9 @@ struct llama_layer {
  struct ggml_tensor * ffn_up_shexp;
 
  // ff bias
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
  struct ggml_tensor * ffn_act;
 
  // mamba proj
@@ -2086,7 +2163,9 @@ struct llama_vocab {
  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;
 
- std::unordered_map<token, id> special_tokens_cache;
+ std::vector<id> cache_special_tokens;
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
+ std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
 
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
@@ -3832,6 +3911,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
+ case MODEL_16B: return "16B";
  case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
@@ -3839,6 +3919,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_40B: return "40B";
  case MODEL_65B: return "65B";
  case MODEL_70B: return "70B";
+ case MODEL_236B: return "236B";
  case MODEL_314B: return "314B";
  case MODEL_SMALL: return "0.1B";
  case MODEL_MEDIUM: return "0.4B";
@@ -3981,7 +4062,9 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+ // granite uses a vocab with len 49152
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+ case 36: model.type = e_model::MODEL_8B; break; // granite
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4334,8 @@ static void llm_load_hparams(
  case 30: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_15B; break;
+ case 52: model.type = e_model::MODEL_20B; break; // granite
+ case 88: model.type = e_model::MODEL_34B; break; // granite
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4384,6 +4469,26 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ if (!is_lite) {
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ }
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+ switch (hparams.n_layer) {
+ case 27: model.type = e_model::MODEL_16B; break;
+ case 60: model.type = e_model::MODEL_236B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }
 
@@ -4490,15 +4595,14 @@ static void llm_load_vocab(
  vocab.special_cls_id = 101;
  vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
- } else {
- if (tokenizer_model == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
- } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
- return;
+ } else if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  }
+
  // read bpe merges and populate bpe ranks
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  if (merges_keyidx == -1) {
@@ -4532,6 +4636,8 @@ static void llm_load_vocab(
  vocab.special_pad_id = -1;
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }
 
  // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4699,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "dbrx") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4721,97 +4830,40 @@ static void llm_load_vocab(
 
  // build special tokens cache
  {
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
- // and will always be correctly labeled in 'added_tokens.json' etc.
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
- // are special tokens.
- // From testing, this appears to correlate 1:1 with special tokens.
- //
-
- // Counting special tokens and verifying in only one direction
- // is sufficient to detect difference in those two sets.
- //
- uint32_t special_tokens_count_by_type = 0;
- uint32_t special_tokens_count_from_verification = 0;
-
- bool special_tokens_definition_mismatch = false;
-
- for (const auto & t : vocab.token_to_id) {
- const auto & token = t.first;
- const auto & id = t.second;
-
- // Count all non-normal tokens in the vocab while iterating
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
  if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_count_by_type++;
+ vocab.cache_special_tokens.push_back(id);
  }
+ }
 
- // Skip single character tokens
- if (token.length() > 1) {
- bool is_tokenizable = false;
-
- // Split token string representation in two, in all possible ways
- // and check if both halves can be matched to a valid token
- for (unsigned i = 1; i < token.length();) {
- const auto left = token.substr(0, i);
- const auto right = token.substr(i);
-
- // check if we didnt partition in the middle of a utf sequence
- auto utf = utf8_len(left.at(left.length() - 1));
-
- if (utf == 1) {
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
- is_tokenizable = true;
- break;
- }
- i++;
- } else {
- // skip over the rest of multibyte utf sequence
- i += utf - 1;
- }
- }
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
+ }
+ );
 
- if (!is_tokenizable) {
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
- // it's faster to re-filter them here, since there are way less candidates now
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+ }
 
- // Calculate a total "utf" length of a token string representation
- size_t utf8_str_len = 0;
- for (unsigned i = 0; i < token.length();) {
- utf8_str_len++;
- i += utf8_len(token.at(i));
- }
+ // build token to piece caches
+ {
+ size_t size_cache = 0;
 
- // And skip the ones which are one character
- if (utf8_str_len > 1) {
- // At this point what we have left are special tokens only
- vocab.special_tokens_cache[token] = id;
+ std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
+ std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
 
- // Count manually found special tokens
- special_tokens_count_from_verification++;
+ for (uint32_t id = 0; id < n_vocab; ++id) {
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
+ cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
 
- // If this manually found special token is not marked as such, flag a mismatch
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
- special_tokens_definition_mismatch = true;
- }
- }
- }
- }
+ size_cache += cache_token_to_piece[id].size();
+ size_cache += cache_token_to_piece_special[id].size();
  }
 
- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size(),
- special_tokens_count_by_type, vocab.id_to_token.size()
- );
- } else {
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
- __func__,
- special_tokens_count_from_verification, vocab.id_to_token.size()
- );
- }
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
+ std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
  }
  }
 
@@ -4892,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+ }
  }
 
  // Returns false if cancelled by progress_callback
@@ -5048,8 +5110,6 @@ static bool llm_load_tensors(
  throw std::runtime_error("model has expert layers but no expert layers are used");
  }
 
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
  ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
  ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
  ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5103,6 +5163,11 @@ static bool llm_load_tensors(
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ // optional MLP bias
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  } else {
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
@@ -6210,6 +6275,70 @@ static bool llm_load_tensors(
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
  }
  } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t q_lora_rank = hparams.n_lora_q;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ if (!is_lite) {
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+ }
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+ if (!is_lite) {
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
+ } else {
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ }
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ } else {
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
+ }
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6664,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
  int64_t n_expert_used,
  llm_ffn_op_type type_op,
  bool norm_w,
+ bool scale_w,
+ float w_scale,
  const llm_build_cb & cb,
  int il) {
  int64_t n_embd = cur->ne[0];
@@ -6695,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
 
  weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
  }
+ if (scale_w) {
+ weights = ggml_scale(ctx, weights, w_scale);
+ cb(weights, "ffn_moe_weights_scaled", il);
+ }
 
  cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
  ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
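The two new llm_build_moe_ffn parameters above (scale_w, w_scale) multiply the selected expert routing weights by a constant after the optional normalization step; the DeepSeek2 graph later in this diff passes hparams.expert_weights_scale, while every existing caller passes false, 0.0 to keep the previous behaviour. A minimal sketch of the same scaling step on a plain vector instead of a ggml tensor (illustrative only, not the ggml API):

    #include <vector>

    // Mirrors the ggml_scale(ctx, weights, w_scale) call guarded by scale_w above.
    static std::vector<float> scale_expert_weights(std::vector<float> weights,
                                                   bool scale_w, float w_scale) {
        if (scale_w) {
            for (float & w : weights) {
                w *= w_scale; // e.g. DeepSeek-V2's expert_weights_scale
            }
        }
        return weights;
    }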
@@ -7305,9 +7440,9 @@ struct llm_build_context {
  cb(cur, "ffn_norm", il);
 
  cur = llm_build_ffn(ctx0, cur,
- model.layers[il].ffn_up, NULL,
- model.layers[il].ffn_gate, NULL,
- model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
@@ -7325,6 +7460,7 @@ struct llm_build_context {
  model.layers[il].ffn_down_exps,
  n_expert, n_expert_used,
  LLM_FFN_SILU, true,
+ false, 0.0,
  cb, il);
  cb(cur, "ffn_moe_out", il);
  }
@@ -7806,6 +7942,7 @@ struct llm_build_context {
  model.layers[il].ffn_down_exps,
  n_expert, n_expert_used,
  LLM_FFN_GELU, true,
+ false, 0.0,
  cb, il);
  cb(cur, "ffn_moe_out", il);
 
@@ -7949,6 +8086,7 @@ struct llm_build_context {
  model.layers[il].ffn_down_exps,
  n_expert, n_expert_used,
  LLM_FFN_SILU, true,
+ false, 0.0,
  cb, il);
  cb(cur, "ffn_moe_out", il);
 
@@ -9087,6 +9225,7 @@ struct llm_build_context {
  model.layers[il].ffn_down_exps,
  n_expert, n_expert_used,
  LLM_FFN_SILU, false,
+ false, 0.0,
  cb, il);
  cb(cur, "ffn_moe_out", il);
 
@@ -10974,6 +11113,7 @@ struct llm_build_context {
  model.layers[il].ffn_down_exps,
  n_expert, n_expert_used,
  LLM_FFN_SILU, true,
+ false, 0.0,
  cb, il);
  cb(cur, "ffn_moe_out", il);
 
@@ -11005,6 +11145,239 @@ struct llm_build_context {
 
  return gf;
  }
+
+ struct ggml_cgraph * build_deepseek2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ bool is_lite = (hparams.n_layer == 27);
+
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ struct ggml_tensor * q = NULL;
+ if (!is_lite) {
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+ cb(q, "q", il);
+
+ q = llm_build_norm(ctx0, q, hparams,
+ model.layers[il].attn_q_a_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(q, "q", il);
+
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+ cb(q, "q", il);
+ } else {
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+ }
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+ kv_pe_compresseed->nb[1],
+ 0);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // and {n_embd_head_qk_rope, n_tokens}
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+ kv_pe_compresseed->nb[1],
+ kv_pe_compresseed->nb[1],
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ 0);
+ cb(v_states, "v_states", il);
+
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ q_pe = ggml_rope_ext(
+ ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ k_pe = ggml_rope_ext(
+ ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ true, hparams.expert_weights_scale,
+ cb, il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up_shexp, NULL,
+ model.layers[il].ffn_gate_shexp, NULL,
+ model.layers[il].ffn_down_shexp, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  };
 
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
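The build_deepseek2() graph added above pre-scales the attention factors for YaRN RoPE, as its in-code comment notes: mscale folds rope_yarn_log_mul into the usual attention factor, kq_scale applies mscale twice (once per projected side) on top of 1/sqrt(head_dim), and attn_factor_scaled compensates for the default 0.1-based magnitude correction applied inside the RoPE op (see the linked discussion for the derivation). A standalone C++ rendering of just that arithmetic, with made-up example values in place of the hyperparameters that really come from the GGUF metadata:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float attn_factor       = 1.0f;
        const float freq_scale        = 0.025f; // hypothetical YaRN compression ratio
        const float rope_yarn_log_mul = 0.1f;   // hypothetical %s.rope.scaling.yarn_log_multiplier
        const float n_embd_head_k     = 192.0f; // hypothetical head size

        const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / sqrtf(n_embd_head_k);
        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

        std::printf("mscale=%.4f kq_scale=%.6f attn_factor_scaled=%.4f\n",
                    mscale, kq_scale, attn_factor_scaled);
        return 0;
    }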
@@ -11223,6 +11596,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_arctic();
  } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ result = llm.build_deepseek2();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -12512,6 +12889,7 @@ struct llm_tokenizer_bpe {
  });
  break;
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
  word_collection = unicode_regex_split(text, {
  // same as llama3
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13112,7 @@ struct llm_tokenizer_wpm {
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
- auto * token_map = &vocab.token_to_id;
+ const auto & token_map = vocab.token_to_id;
 
  // normalize and split by whitespace
  std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13127,89 @@ struct llm_tokenizer_wpm {
  }
 
  // prepend phantom space
- std::string word1 = "\xe2\x96\x81" + word;
- int n = word1.size();
+ const std::string word1 = "\xe2\x96\x81" + word;
+ const int n = word1.size();
 
- // we're at the start of a new word
- int i = 0;
- bool match_any = false;
+ const size_t current_tokens = output.size();
 
+ // we're at the start of a new word
  // move through character position in word
- while (i < n) {
+ for (int i = 0; i < n; ++i) {
  // loop through possible match length
  bool match = false;
  for (int j = n; j > i; j--) {
- auto it = token_map->find(word1.substr(i, j - i));
- if (it != token_map->end()) {
+ auto it = token_map.find(word1.substr(i, j - i));
+ if (it != token_map.end()) {
  output.push_back(it->second);
  match = true;
- match_any = true;
- i = j;
+ i = j - 1;
  break;
  }
  }
 
- // must be an unknown character
- if (!match) {
- i++;
+ if (!match) { // discard all
+ output.resize(current_tokens);
+ break; // and discard next tokens
  }
  }
 
  // we didn't find any matches for this word
- if (!match_any) {
+ if (current_tokens == output.size()) {
  output.push_back(vocab.special_unk_id);
  }
  }
  }
 
  std::vector<std::string> preprocess(const std::string & text) {
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-
- // strip accents, strip control, uniformize whitespace,
- // to lowercase, pad chinese characters, pad punctuation
- std::string new_str = "";
- for (uint32_t code : cpts_nfd) {
- const codepoint_flags flags = unicode_cpt_flags(code);
- if (flags.is_accent_mark || flags.is_control) {
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+ std::vector<std::string> words(1, "");
+
+ for (const char32_t cpt : cpts_nfd) {
+ const auto flags = unicode_cpt_flags(cpt);
+
+ if (flags.is_whitespace) {
+ if (words.back().size()) { // finish previous word if any
+ words.emplace_back();
+ }
  continue;
  }
- code = unicode_tolower(code);
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
- code = ' ';
- }
- std::string s = unicode_cpt_to_utf8(code);
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
- new_str += " ";
- new_str += s;
- new_str += " ";
- } else {
- new_str += s;
+
+ assert (!flags.is_separator);
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+ continue;
  }
- }
 
- // split by whitespace
- uint64_t l = 0;
- uint64_t r = 0;
- std::vector<std::string> words;
- while (r < new_str.size()) {
- // if is whitespace
- if (isspace(new_str[r], std::locale::classic())) {
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
- l = r + 1;
- r = l;
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+ if (words.back().size()) { // finish previous word if any
+ words.emplace_back();
+ }
+ words.back() = s; // single char word
+ words.emplace_back(); // start a new word
  } else {
- r += 1;
+ words.back() += s; // append char to word
  }
  }
- if (r > l) {
- words.push_back(new_str.substr(l, (r - l)));
- }
- return words;
- }
 
- bool is_ascii_punct(uint32_t code) {
- if (code > 0xFF) {
- return false;
+ if (!words.back().size()) {
+ words.pop_back();
  }
- auto c = char(static_cast<unsigned char>(code));
- return ispunct(c, std::locale::classic());
+
+ return words;
  }
 
- bool is_chinese_char(uint32_t cpt) {
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
+ static bool is_chinese_char(uint32_t cpt) {
+ return
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
- (cpt >= 0x3000 && cpt <= 0x303F) ||
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
- return true; // NOLINT
- }
- return false;
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
  }
 
  const llama_vocab & vocab;
@@ -12894,9 +13253,8 @@ struct fragment_buffer_variant {
 
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
  // for each special token
- for (const auto & st: vocab.special_tokens_cache) {
- const auto & special_token = st.first;
- const auto & special_id = st.second;
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
+ const auto & special_token = vocab.id_to_token[special_id].text;
 
  // for each text fragment
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
 
  // if a fragment is text ( not yet processed )
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto * raw_text = &(fragment.raw_text);
+ auto & raw_text = fragment.raw_text;
 
  auto raw_text_base_offset = fragment.offset;
  auto raw_text_base_length = fragment.length;
@@ -12915,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
- auto match = raw_text->find(special_token, raw_text_base_offset);
+ auto match = raw_text.find(special_token, raw_text_base_offset);
 
  // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;
@@ -12934,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  // left
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
  const int64_t left_reminder_length = match - raw_text_base_offset;
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
 
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@@ -12950,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
  const int64_t right_reminder_offset = match + special_token.length();
  const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
 
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@@ -14054,7 +14412,7 @@ void llama_sample_repetition_penalties(
 
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
  GGML_ASSERT(ctx);
- const int64_t t_start_sample_us = ggml_time_us();
+ int64_t t_start_sample_us = ggml_time_us();
 
  bool allow_eog = false;
  for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
  candidates_decoded.reserve(candidates->size);
- std::vector<llama_grammar_candidate> candidates_grammar;
+
+ std::vector<llama_grammar_candidate> candidates_grammar;
  candidates_grammar.reserve(candidates->size);
 
  for (size_t i = 0; i < candidates->size; ++i) {
- const llama_token id = candidates->data[i].id;
- const std::string piece = llama_token_to_piece(ctx, id, false);
+ const llama_token id = candidates->data[i].id;
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
 
  if (llama_token_is_eog(&ctx->model, id)) {
  if (!allow_eog) {
@@ -14271,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
  GGML_ASSERT(false);
  }
 
- const std::string piece = llama_token_to_piece(ctx, token, false);
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
 
  // Note terminating 0 in decoded string
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -16235,6 +16594,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_COMMAND_R:
  case LLM_ARCH_OLMO:
  case LLM_ARCH_ARCTIC:
+ case LLM_ARCH_DEEPSEEK2:
  return LLAMA_ROPE_TYPE_NORM;
 
  // the pairs of head values are offset by n_rot/2
@@ -17861,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
  );
  }
 
+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+ return llama_is_control_token(model->vocab, token);
+ }
+
  llama_token llama_token_bos(const struct llama_model * model) {
  return model->vocab.special_bos_id;
  }
@@ -17932,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
 
  const auto cpts = unicode_cpts_from_utf8(text);
  for (const auto cpt : cpts) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
+ try {
+ decoded_text += unicode_utf8_to_byte(utf8);
+ } catch (const std::out_of_range & e) {
+ decoded_text += "[UNK_BYTE_0x";
+ for (const auto c : utf8) {
+ decoded_text += format("%02x", (uint8_t) c);
+ }
+ decoded_text += text + "]";
+ }
  }
 
  return decoded_text;
@@ -17940,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
 
  // does not write null-terminator to buf
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+ // if we have a cache - use it
+ {
+ const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+
+ if (!cache.empty()) {
+ const auto & res = cache.at(token);
+ if (length < (int) res.size()) {
+ return -(int) res.size();
+ }
+ memcpy(buf, res.c_str(), res.size());
+ return res.size();
+ }
+ }
+
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
- case LLAMA_VOCAB_TYPE_WPM:
- case LLAMA_VOCAB_TYPE_SPM: {
- // NOTE: we accept all unsupported token types,
- // suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
- llama_unescape_whitespace(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
- if (length < 3) {
- return -3;
- }
- memcpy(buf, "\xe2\x96\x85", 3);
- return 3;
- } else if (llama_is_byte_token(model->vocab, token)) {
- if (length < 1) {
- return -1;
+ case LLAMA_VOCAB_TYPE_WPM:
+ case LLAMA_VOCAB_TYPE_SPM: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ llama_unescape_whitespace(result);
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+ if (length < 3) {
+ return -3;
+ }
+ memcpy(buf, "\xe2\x96\x85", 3);
+ return 3;
+ } else if (llama_is_byte_token(model->vocab, token)) {
+ if (length < 1) {
+ return -1;
+ }
+ buf[0] = llama_token_to_byte(model->vocab, token);
+ return 1;
  }
- buf[0] = llama_token_to_byte(model->vocab, token);
- return 1;
+ break;
  }
- break;
- }
- case LLAMA_VOCAB_TYPE_BPE: {
- // NOTE: we accept all unsupported token types,
- // suppressing them like CONTROL tokens.
- if (llama_is_normal_token(model->vocab, token)) {
- std::string result = model->vocab.id_to_token[token].text;
- result = llama_decode_text(result);
- if (length < (int) result.length()) {
- return -(int) result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- } else if (
- (llama_is_user_defined_token(model->vocab, token)) ||
- (llama_is_control_token (model->vocab, token) && special)) {
- std::string result = model->vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -(int) result.length();
+ case LLAMA_VOCAB_TYPE_BPE: {
+ // NOTE: we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ if (llama_is_normal_token(model->vocab, token)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ result = llama_decode_text(result);
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
+ } else if (
+ (llama_is_user_defined_token(model->vocab, token)) ||
+ (llama_is_control_token (model->vocab, token) && special)) {
+ std::string result = model->vocab.id_to_token[token].text;
+ if (length < (int) result.length()) {
+ return -(int) result.length();
+ }
+ memcpy(buf, result.c_str(), result.length());
+ return result.length();
  }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
+ break;
  }
- break;
- }
- default:
- GGML_ASSERT(false);
+ default:
+ GGML_ASSERT(false);
  }
  }
  return 0;
@@ -18337,6 +18724,7 @@ const char * llama_print_system_info(void) {
  s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";