llama_cpp 0.15.3 → 0.15.4

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
@@ -103,7 +103,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 128
+ #define LLAMA_MAX_EXPERTS 160

  //
  // logging
@@ -222,6 +222,7 @@ enum llm_arch {
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
  LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_UNKNOWN,
  };

@@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -279,11 +281,15 @@ enum llm_kv {
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
  LLM_KV_BLOCK_COUNT,
+ LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_EXPERT_SHARED_COUNT,
+ LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,

@@ -296,6 +302,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
  LLM_KV_ATTENTION_CAUSAL,
+ LLM_KV_ATTENTION_Q_LORA_RANK,
+ LLM_KV_ATTENTION_KV_LORA_RANK,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
@@ -305,6 +313,7 @@ enum llm_kv {
  LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  LLM_KV_ROPE_SCALING_FINETUNED,
+ LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

  LLM_KV_SPLIT_NO,
  LLM_KV_SPLIT_COUNT,
@@ -353,17 +362,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -374,6 +387,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -383,6 +398,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
  { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
  { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },

  { LLM_KV_SPLIT_NO, "split.no" },
  { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -474,6 +490,12 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_ATTN_Q_A,
+ LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV_A_MQA,
+ LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_Q_A_NORM,
+ LLM_TENSOR_ATTN_KV_A_NORM,
  };

  static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1057,6 +1079,35 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_DEEPSEEK2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1651,12 +1702,13 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ // NOTE: avoid ever using this except for building the token_to_piece caches
+ static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+ int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
@@ -1741,6 +1793,7 @@ enum e_model {
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
+ MODEL_16B,
  MODEL_20B,
  MODEL_30B,
  MODEL_34B,
@@ -1748,6 +1801,7 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_236B,
  MODEL_314B,
  MODEL_SMALL,
  MODEL_MEDIUM,
@@ -1783,6 +1837,13 @@ struct llama_hparams {
  uint32_t n_expert_used = 0;
  uint32_t n_vocab_type = 0; // for BERT-style token types

+ uint32_t n_layer_dense_lead = 0;
+ uint32_t n_lora_q = 0;
+ uint32_t n_lora_kv = 0;
+ uint32_t n_ff_exp = 0;
+ uint32_t n_expert_shared = 0;
+ float expert_weights_scale = 0.0;
+
  float f_norm_eps;
  float f_norm_rms_eps;

@@ -1790,6 +1851,7 @@ struct llama_hparams {
  float rope_freq_base_train;
  float rope_freq_scale_train;
  uint32_t n_yarn_orig_ctx;
+ float rope_yarn_log_mul;

  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -1823,6 +1885,12 @@ struct llama_hparams {
  if (this->n_expert != other.n_expert) return true;
  if (this->n_expert_used != other.n_expert_used) return true;

+ if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
+ if (this->n_lora_q != other.n_lora_q) return true;
+ if (this->n_lora_kv != other.n_lora_kv) return true;
+ if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_expert_shared != other.n_expert_shared) return true;
+
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

@@ -1838,6 +1906,8 @@ struct llama_hparams {
  if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+ if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+ if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;

  return false;
  }
@@ -1913,6 +1983,8 @@ struct llama_layer {
  struct ggml_tensor * attn_k_norm_b;
  struct ggml_tensor * attn_out_norm;
  struct ggml_tensor * attn_out_norm_b;
+ struct ggml_tensor * attn_q_a_norm;
+ struct ggml_tensor * attn_kv_a_norm;

  // attention
  struct ggml_tensor * wq;
@@ -1920,6 +1992,10 @@ struct llama_layer {
  struct ggml_tensor * wv;
  struct ggml_tensor * wo;
  struct ggml_tensor * wqkv;
+ struct ggml_tensor * wq_a;
+ struct ggml_tensor * wq_b;
+ struct ggml_tensor * wkv_a_mqa;
+ struct ggml_tensor * wkv_b;

  // attention bias
  struct ggml_tensor * bq;
@@ -1953,8 +2029,9 @@ struct llama_layer {
  struct ggml_tensor * ffn_up_shexp;

  // ff bias
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
  struct ggml_tensor * ffn_act;

  // mamba proj
@@ -2086,7 +2163,9 @@ struct llama_vocab {
  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;

- std::unordered_map<token, id> special_tokens_cache;
+ std::vector<id> cache_special_tokens;
+ std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
+ std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);

  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

@@ -3832,6 +3911,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
+ case MODEL_16B: return "16B";
  case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
@@ -3839,6 +3919,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_40B: return "40B";
  case MODEL_65B: return "65B";
  case MODEL_70B: return "70B";
+ case MODEL_236B: return "236B";
  case MODEL_314B: return "314B";
  case MODEL_SMALL: return "0.1B";
  case MODEL_MEDIUM: return "0.4B";
@@ -3981,7 +4062,9 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+ // granite uses a vocab with len 49152
+ case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+ case 36: model.type = e_model::MODEL_8B; break; // granite
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -4251,6 +4334,8 @@ static void llm_load_hparams(
  case 30: model.type = e_model::MODEL_3B; break;
  case 32: model.type = e_model::MODEL_7B; break;
  case 40: model.type = e_model::MODEL_15B; break;
+ case 52: model.type = e_model::MODEL_20B; break; // granite
+ case 88: model.type = e_model::MODEL_34B; break; // granite
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -4384,6 +4469,26 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ bool is_lite = (hparams.n_layer == 27);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ if (!is_lite) {
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ }
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+ switch (hparams.n_layer) {
+ case 27: model.type = e_model::MODEL_16B; break;
+ case 60: model.type = e_model::MODEL_236B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4490,15 +4595,14 @@ static void llm_load_vocab(
  vocab.special_cls_id = 101;
  vocab.special_mask_id = 103;
  vocab.add_space_prefix = false;
- } else {
- if (tokenizer_model == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
- } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
- return;
+ } else if (tokenizer_model == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+ if (add_space_prefix_keyidx != -1) {
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
  }
+
  // read bpe merges and populate bpe ranks
  const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  if (merges_keyidx == -1) {
@@ -4532,6 +4636,8 @@ static void llm_load_vocab(
  vocab.special_pad_id = -1;
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }

  // for now, only BPE models have pre-tokenizers
@@ -4593,6 +4699,9 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "dbrx") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -4721,97 +4830,40 @@ static void llm_load_vocab(
4721
4830
 
4722
4831
  // build special tokens cache
4723
4832
  {
4724
- // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
4725
- // and will always be correctly labeled in 'added_tokens.json' etc.
4726
- // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
4727
- // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
4728
- // are special tokens.
4729
- // From testing, this appears to correlate 1:1 with special tokens.
4730
- //
4731
-
4732
- // Counting special tokens and verifying in only one direction
4733
- // is sufficient to detect difference in those two sets.
4734
- //
4735
- uint32_t special_tokens_count_by_type = 0;
4736
- uint32_t special_tokens_count_from_verification = 0;
4737
-
4738
- bool special_tokens_definition_mismatch = false;
4739
-
4740
- for (const auto & t : vocab.token_to_id) {
4741
- const auto & token = t.first;
4742
- const auto & id = t.second;
4743
-
4744
- // Count all non-normal tokens in the vocab while iterating
4833
+ for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
4745
4834
  if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
4746
- special_tokens_count_by_type++;
4835
+ vocab.cache_special_tokens.push_back(id);
4747
4836
  }
4837
+ }
4748
4838
 
4749
- // Skip single character tokens
4750
- if (token.length() > 1) {
4751
- bool is_tokenizable = false;
4752
-
4753
- // Split token string representation in two, in all possible ways
4754
- // and check if both halves can be matched to a valid token
4755
- for (unsigned i = 1; i < token.length();) {
4756
- const auto left = token.substr(0, i);
4757
- const auto right = token.substr(i);
4758
-
4759
- // check if we didnt partition in the middle of a utf sequence
4760
- auto utf = utf8_len(left.at(left.length() - 1));
4761
-
4762
- if (utf == 1) {
4763
- if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
4764
- vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
4765
- is_tokenizable = true;
4766
- break;
4767
- }
4768
- i++;
4769
- } else {
4770
- // skip over the rest of multibyte utf sequence
4771
- i += utf - 1;
4772
- }
4773
- }
4839
+ std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
4840
+ [&] (const llama_vocab::id a, const llama_vocab::id b) {
4841
+ return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
4842
+ }
4843
+ );
4774
4844
 
4775
- if (!is_tokenizable) {
4776
- // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
4777
- // it's faster to re-filter them here, since there are way less candidates now
4845
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
4846
+ }
4778
4847
 
4779
- // Calculate a total "utf" length of a token string representation
4780
- size_t utf8_str_len = 0;
4781
- for (unsigned i = 0; i < token.length();) {
4782
- utf8_str_len++;
4783
- i += utf8_len(token.at(i));
4784
- }
4848
+ // build token to piece caches
4849
+ {
4850
+ size_t size_cache = 0;
4785
4851
 
4786
- // And skip the ones which are one character
4787
- if (utf8_str_len > 1) {
4788
- // At this point what we have left are special tokens only
4789
- vocab.special_tokens_cache[token] = id;
4852
+ std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
4853
+ std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
4790
4854
 
4791
- // Count manually found special tokens
4792
- special_tokens_count_from_verification++;
4855
+ for (uint32_t id = 0; id < n_vocab; ++id) {
4856
+ cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
4857
+ cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
4793
4858
 
4794
- // If this manually found special token is not marked as such, flag a mismatch
4795
- if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
4796
- special_tokens_definition_mismatch = true;
4797
- }
4798
- }
4799
- }
4800
- }
4859
+ size_cache += cache_token_to_piece[id].size();
4860
+ size_cache += cache_token_to_piece_special[id].size();
4801
4861
  }
4802
4862
 
4803
- if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
4804
- LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
4805
- __func__,
4806
- special_tokens_count_from_verification, vocab.id_to_token.size(),
4807
- special_tokens_count_by_type, vocab.id_to_token.size()
4808
- );
4809
- } else {
4810
- LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
4811
- __func__,
4812
- special_tokens_count_from_verification, vocab.id_to_token.size()
4813
- );
4814
- }
4863
+ std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
4864
+ std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
4865
+
4866
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
4815
4867
  }
4816
4868
  }
4817
4869
 
@@ -4892,6 +4944,16 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4892
4944
  if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
4893
4945
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
4894
4946
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
4947
+
4948
+ if (model.arch == LLM_ARCH_DEEPSEEK2) {
4949
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
4950
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
4951
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
4952
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4953
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
4954
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
4955
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
4956
+ }
4895
4957
  }
4896
4958
 
4897
4959
  // Returns false if cancelled by progress_callback
@@ -5048,8 +5110,6 @@ static bool llm_load_tensors(
5048
5110
  throw std::runtime_error("model has expert layers but no expert layers are used");
5049
5111
  }
5050
5112
 
5051
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
5052
-
5053
5113
  ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
5054
5114
  ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
5055
5115
  ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
@@ -5103,6 +5163,11 @@ static bool llm_load_tensors(
5103
5163
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5104
5164
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5105
5165
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5166
+
5167
+ // optional MLP bias
5168
+ layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5169
+ layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
5170
+ layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
5106
5171
  } else {
5107
5172
  layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5108
5173
 
@@ -6210,6 +6275,70 @@ static bool llm_load_tensors(
6210
6275
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
6211
6276
  }
6212
6277
  } break;
6278
+ case LLM_ARCH_DEEPSEEK2:
6279
+ {
6280
+ bool is_lite = (hparams.n_layer == 27);
6281
+
6282
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
6283
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
6284
+ const uint32_t q_lora_rank = hparams.n_lora_q;
6285
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
6286
+ const uint32_t n_ff_exp = hparams.n_ff_exp;
6287
+
6288
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6289
+
6290
+ // output
6291
+ {
6292
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6293
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
6294
+ }
6295
+
6296
+ for (int i = 0; i < n_layer; ++i) {
6297
+ ggml_context * ctx_layer = ctx_for_layer(i);
6298
+ ggml_context * ctx_split = ctx_for_layer_split(i);
6299
+
6300
+ auto & layer = model.layers[i];
6301
+
6302
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
6303
+ if (!is_lite) {
6304
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
6305
+ }
6306
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
6307
+
6308
+ if (!is_lite) {
6309
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
6310
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.n_head * hparams.n_embd_head_k});
6311
+ } else {
6312
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
6313
+ }
6314
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope});
6315
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, hparams.n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)});
6316
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {hparams.n_head * hparams.n_embd_head_v, n_embd});
6317
+
6318
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
6319
+
6320
+ if ((uint32_t) i < hparams.n_layer_dense_lead) {
6321
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
6322
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
6323
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
6324
+ } else {
6325
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
6326
+
6327
+ GGML_ASSERT(hparams.n_expert > 0);
6328
+ GGML_ASSERT(hparams.n_expert_used > 0);
6329
+
6330
+ // MoE branch
6331
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6332
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
6333
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
6334
+
6335
+ // Shared expert branch
6336
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6337
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * hparams.n_expert_shared, n_embd});
6338
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * hparams.n_expert_shared});
6339
+ }
6340
+ }
6341
+ } break;
6213
6342
  default:
6214
6343
  throw std::runtime_error("unknown architecture");
6215
6344
  }
@@ -6664,6 +6793,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
6664
6793
  int64_t n_expert_used,
6665
6794
  llm_ffn_op_type type_op,
6666
6795
  bool norm_w,
6796
+ bool scale_w,
6797
+ float w_scale,
6667
6798
  const llm_build_cb & cb,
6668
6799
  int il) {
6669
6800
  int64_t n_embd = cur->ne[0];
@@ -6695,6 +6826,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
6695
6826
 
6696
6827
  weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6697
6828
  }
6829
+ if (scale_w) {
6830
+ weights = ggml_scale(ctx, weights, w_scale);
6831
+ cb(weights, "ffn_moe_weights_scaled", il);
6832
+ }
6698
6833
 
6699
6834
  cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6700
6835
  ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
@@ -7305,9 +7440,9 @@ struct llm_build_context {
7305
7440
  cb(cur, "ffn_norm", il);
7306
7441
 
7307
7442
  cur = llm_build_ffn(ctx0, cur,
7308
- model.layers[il].ffn_up, NULL,
7309
- model.layers[il].ffn_gate, NULL,
7310
- model.layers[il].ffn_down, NULL,
7443
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7444
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
7445
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7311
7446
  NULL,
7312
7447
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7313
7448
  cb(cur, "ffn_out", il);
@@ -7325,6 +7460,7 @@ struct llm_build_context {
7325
7460
  model.layers[il].ffn_down_exps,
7326
7461
  n_expert, n_expert_used,
7327
7462
  LLM_FFN_SILU, true,
7463
+ false, 0.0,
7328
7464
  cb, il);
7329
7465
  cb(cur, "ffn_moe_out", il);
7330
7466
  }
@@ -7806,6 +7942,7 @@ struct llm_build_context {
7806
7942
  model.layers[il].ffn_down_exps,
7807
7943
  n_expert, n_expert_used,
7808
7944
  LLM_FFN_GELU, true,
7945
+ false, 0.0,
7809
7946
  cb, il);
7810
7947
  cb(cur, "ffn_moe_out", il);
7811
7948
 
@@ -7949,6 +8086,7 @@ struct llm_build_context {
7949
8086
  model.layers[il].ffn_down_exps,
7950
8087
  n_expert, n_expert_used,
7951
8088
  LLM_FFN_SILU, true,
8089
+ false, 0.0,
7952
8090
  cb, il);
7953
8091
  cb(cur, "ffn_moe_out", il);
7954
8092
 
@@ -9087,6 +9225,7 @@ struct llm_build_context {
9087
9225
  model.layers[il].ffn_down_exps,
9088
9226
  n_expert, n_expert_used,
9089
9227
  LLM_FFN_SILU, false,
9228
+ false, 0.0,
9090
9229
  cb, il);
9091
9230
  cb(cur, "ffn_moe_out", il);
9092
9231
 
@@ -10974,6 +11113,7 @@ struct llm_build_context {
10974
11113
  model.layers[il].ffn_down_exps,
10975
11114
  n_expert, n_expert_used,
10976
11115
  LLM_FFN_SILU, true,
11116
+ false, 0.0,
10977
11117
  cb, il);
10978
11118
  cb(cur, "ffn_moe_out", il);
10979
11119
 
@@ -11005,6 +11145,239 @@ struct llm_build_context {
11005
11145
 
11006
11146
  return gf;
11007
11147
  }
11148
+
11149
+ struct ggml_cgraph * build_deepseek2() {
11150
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
11151
+
11152
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11153
+ int32_t n_tokens = this->n_tokens;
11154
+
11155
+ bool is_lite = (hparams.n_layer == 27);
11156
+
11157
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
11158
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
11159
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
11160
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
11161
+ const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
11162
+
11163
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
11164
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
11165
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11166
+
11167
+ struct ggml_tensor * cur;
11168
+ struct ggml_tensor * inpL;
11169
+
11170
+ // {n_embd, n_tokens}
11171
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
11172
+
11173
+ // inp_pos - contains the positions
11174
+ struct ggml_tensor * inp_pos = build_inp_pos();
11175
+
11176
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11177
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
11178
+
11179
+ for (int il = 0; il < n_layer; ++il) {
11180
+ struct ggml_tensor * inpSA = inpL;
11181
+
11182
+ // norm
11183
+ cur = llm_build_norm(ctx0, inpL, hparams,
11184
+ model.layers[il].attn_norm, NULL,
11185
+ LLM_NORM_RMS, cb, il);
11186
+ cb(cur, "attn_norm", il);
11187
+
11188
+ // self_attention
11189
+ {
11190
+ struct ggml_tensor * q = NULL;
11191
+ if (!is_lite) {
11192
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
11193
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
11194
+ cb(q, "q", il);
11195
+
11196
+ q = llm_build_norm(ctx0, q, hparams,
11197
+ model.layers[il].attn_q_a_norm, NULL,
11198
+ LLM_NORM_RMS, cb, il);
11199
+ cb(q, "q", il);
11200
+
11201
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
11202
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
11203
+ cb(q, "q", il);
11204
+ } else {
11205
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
11206
+ cb(q, "q", il);
11207
+ }
11208
+
11209
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11210
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
11211
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11212
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11213
+ 0);
11214
+ cb(q_nope, "q_nope", il);
11215
+
11216
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
11217
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
11218
+ ggml_row_size(q->type, hparams.n_embd_head_k),
11219
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
11220
+ ggml_row_size(q->type, n_embd_head_qk_nope));
11221
+ cb(q_pe, "q_pe", il);
11222
+
11223
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11224
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11225
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
11226
+
11227
+ // split into {kv_lora_rank, n_tokens}
11228
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
11229
+ kv_pe_compresseed->nb[1],
11230
+ 0);
11231
+ cb(kv_compressed, "kv_compressed", il);
11232
+
11233
+ // and {n_embd_head_qk_rope, n_tokens}
11234
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
11235
+ kv_pe_compresseed->nb[1],
11236
+ kv_pe_compresseed->nb[1],
11237
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
11238
+ cb(k_pe, "k_pe", il);
11239
+
11240
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
11241
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
11242
+ model.layers[il].attn_kv_a_norm, NULL,
11243
+ LLM_NORM_RMS, cb, il);
11244
+ cb(kv_compressed, "kv_compressed", il);
11245
+
11246
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
11247
+ struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
11248
+ cb(kv, "kv", il);
11249
+
11250
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
11251
+ struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
11252
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
11253
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11254
+ 0);
11255
+ cb(k_nope, "k_nope", il);
11256
+
11257
+ // and {n_head * n_embd_head_v, n_tokens}
11258
+ struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
11259
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
11260
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
11261
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
11262
+ cb(v_states, "v_states", il);
11263
+
11264
+ v_states = ggml_cont(ctx0, v_states);
11265
+ cb(v_states, "v_states", il);
11266
+
11267
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
11268
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
11269
+ 0);
11270
+ cb(v_states, "v_states", il);
11271
+
11272
+ q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11273
+ q_pe = ggml_rope_ext(
11274
+ ctx0, q_pe, inp_pos, nullptr,
11275
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11276
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11277
+ );
11278
+ cb(q_pe, "q_pe", il);
11279
+
11280
+ // shared RoPE key
11281
+ k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
11282
+ k_pe = ggml_rope_ext(
11283
+ ctx0, k_pe, inp_pos, nullptr,
11284
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
11285
+ ext_factor, attn_factor_scaled, beta_fast, beta_slow
11286
+ );
11287
+ cb(k_pe, "k_pe", il);
11288
+
11289
+ struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
11290
+ cb(q_states, "q_states", il);
11291
+
11292
+ struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
11293
+ cb(k_states, "k_states", il);
11294
+
11295
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
11296
+ model.layers[il].wo, NULL,
11297
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11298
+ }
11299
+
11300
+ if (il == n_layer - 1) {
11301
+ // skip computing output for unused tokens
11302
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11303
+ n_tokens = n_outputs;
11304
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11305
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11306
+ }
11307
+
11308
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11309
+ cb(ffn_inp, "ffn_inp", il);
11310
+
11311
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
11312
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11313
+ model.layers[il].ffn_norm, NULL,
11314
+ LLM_NORM_RMS, cb, il);
11315
+ cb(cur, "ffn_norm", il);
11316
+
11317
+ cur = llm_build_ffn(ctx0, cur,
11318
+ model.layers[il].ffn_up, NULL,
11319
+ model.layers[il].ffn_gate, NULL,
11320
+ model.layers[il].ffn_down, NULL,
11321
+ NULL,
11322
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11323
+ cb(cur, "ffn_out", il);
11324
+ } else {
11325
+ // MoE branch
11326
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11327
+ model.layers[il].ffn_norm, NULL,
11328
+ LLM_NORM_RMS, cb, il);
11329
+ cb(cur, "ffn_norm", il);
11330
+
11331
+ ggml_tensor * moe_out =
11332
+ llm_build_moe_ffn(ctx0, cur,
11333
+ model.layers[il].ffn_gate_inp,
11334
+ model.layers[il].ffn_up_exps,
11335
+ model.layers[il].ffn_gate_exps,
11336
+ model.layers[il].ffn_down_exps,
11337
+ n_expert, n_expert_used,
11338
+ LLM_FFN_SILU, false,
11339
+ true, hparams.expert_weights_scale,
11340
+ cb, il);
11341
+ cb(moe_out, "ffn_moe_out", il);
11342
+
11343
+ // FFN shared expert
11344
+ {
11345
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
11346
+ model.layers[il].ffn_up_shexp, NULL,
11347
+ model.layers[il].ffn_gate_shexp, NULL,
11348
+ model.layers[il].ffn_down_shexp, NULL,
11349
+ NULL,
11350
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11351
+ cb(ffn_shexp, "ffn_shexp", il);
11352
+
11353
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
11354
+ cb(cur, "ffn_out", il);
11355
+ }
11356
+ }
11357
+
11358
+ cur = ggml_add(ctx0, cur, ffn_inp);
11359
+ cb(cur, "l_out", il);
11360
+
11361
+ // input for next layer
11362
+ inpL = cur;
11363
+ }
11364
+
11365
+ cur = inpL;
11366
+
11367
+ cur = llm_build_norm(ctx0, cur, hparams,
11368
+ model.output_norm, NULL,
11369
+ LLM_NORM_RMS, cb, -1);
11370
+ cb(cur, "result_norm", -1);
11371
+
11372
+ // lm_head
11373
+ cur = ggml_mul_mat(ctx0, model.output, cur);
11374
+ cb(cur, "result_output", -1);
11375
+
11376
+ ggml_build_forward_expand(gf, cur);
11377
+
11378
+ return gf;
11379
+ }
11380
+
11008
11381
  };
11009
11382
 
11010
11383
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -11223,6 +11596,10 @@ static struct ggml_cgraph * llama_build_graph(
11223
11596
  {
11224
11597
  result = llm.build_arctic();
11225
11598
  } break;
11599
+ case LLM_ARCH_DEEPSEEK2:
11600
+ {
11601
+ result = llm.build_deepseek2();
11602
+ } break;
11226
11603
  default:
11227
11604
  GGML_ASSERT(false);
11228
11605
  }
@@ -12512,6 +12889,7 @@ struct llm_tokenizer_bpe {
12512
12889
  });
12513
12890
  break;
12514
12891
  case LLAMA_VOCAB_PRE_TYPE_DBRX:
12892
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12515
12893
  word_collection = unicode_regex_split(text, {
12516
12894
  // same as llama3
12517
12895
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12734,7 +13112,7 @@ struct llm_tokenizer_wpm {
12734
13112
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
12735
13113
 
12736
13114
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12737
- auto * token_map = &vocab.token_to_id;
13115
+ const auto & token_map = vocab.token_to_id;
12738
13116
 
12739
13117
  // normalize and split by whitespace
12740
13118
  std::vector<std::string> words = preprocess(text);
@@ -12749,108 +13127,89 @@ struct llm_tokenizer_wpm {
12749
13127
  }
12750
13128
 
12751
13129
  // prepend phantom space
12752
- std::string word1 = "\xe2\x96\x81" + word;
12753
- int n = word1.size();
13130
+ const std::string word1 = "\xe2\x96\x81" + word;
13131
+ const int n = word1.size();
12754
13132
 
12755
- // we're at the start of a new word
12756
- int i = 0;
12757
- bool match_any = false;
13133
+ const size_t current_tokens = output.size();
12758
13134
 
13135
+ // we're at the start of a new word
12759
13136
  // move through character position in word
12760
- while (i < n) {
13137
+ for (int i = 0; i < n; ++i) {
12761
13138
  // loop through possible match length
12762
13139
  bool match = false;
12763
13140
  for (int j = n; j > i; j--) {
12764
- auto it = token_map->find(word1.substr(i, j - i));
12765
- if (it != token_map->end()) {
13141
+ auto it = token_map.find(word1.substr(i, j - i));
13142
+ if (it != token_map.end()) {
12766
13143
  output.push_back(it->second);
12767
13144
  match = true;
12768
- match_any = true;
12769
- i = j;
13145
+ i = j - 1;
12770
13146
  break;
12771
13147
  }
12772
13148
  }
12773
13149
 
12774
- // must be an unknown character
12775
- if (!match) {
12776
- i++;
13150
+ if (!match) { // discard all
13151
+ output.resize(current_tokens);
13152
+ break; // and discard next tokens
12777
13153
  }
12778
13154
  }
12779
13155
 
12780
13156
  // we didn't find any matches for this word
12781
- if (!match_any) {
13157
+ if (current_tokens == output.size()) {
12782
13158
  output.push_back(vocab.special_unk_id);
12783
13159
  }
12784
13160
  }
12785
13161
  }
12786
13162
 
12787
13163
  std::vector<std::string> preprocess(const std::string & text) {
12788
- std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
12789
-
12790
- // strip accents, strip control, uniformize whitespace,
12791
- // to lowercase, pad chinese characters, pad punctuation
12792
- std::string new_str = "";
12793
- for (uint32_t code : cpts_nfd) {
12794
- const codepoint_flags flags = unicode_cpt_flags(code);
12795
- if (flags.is_accent_mark || flags.is_control) {
13164
+ const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13165
+ std::vector<std::string> words(1, "");
13166
+
13167
+ for (const char32_t cpt : cpts_nfd) {
13168
+ const auto flags = unicode_cpt_flags(cpt);
13169
+
13170
+ if (flags.is_whitespace) {
13171
+ if (words.back().size()) { // finish previous word if any
13172
+ words.emplace_back();
13173
+ }
12796
13174
  continue;
12797
13175
  }
12798
- code = unicode_tolower(code);
12799
- if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12800
- code = ' ';
12801
- }
12802
- std::string s = unicode_cpt_to_utf8(code);
12803
- if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12804
- new_str += " ";
12805
- new_str += s;
12806
- new_str += " ";
12807
- } else {
12808
- new_str += s;
13176
+
13177
+ assert (!flags.is_separator);
13178
+ if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
13179
+ continue;
12809
13180
  }
12810
- }
12811
13181
 
12812
- // split by whitespace
12813
- uint64_t l = 0;
12814
- uint64_t r = 0;
12815
- std::vector<std::string> words;
12816
- while (r < new_str.size()) {
12817
- // if is whitespace
12818
- if (isspace(new_str[r], std::locale::classic())) {
12819
- if (r > l) words.push_back(new_str.substr(l, (r - l)));
12820
- l = r + 1;
12821
- r = l;
13182
+ const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
13183
+ if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
13184
+ if (words.back().size()) { // finish previous word if any
13185
+ words.emplace_back();
13186
+ }
13187
+ words.back() = s; // single char word
13188
+ words.emplace_back(); // start a new word
12822
13189
  } else {
12823
- r += 1;
13190
+ words.back() += s; // append char to word
12824
13191
  }
12825
13192
  }
12826
- if (r > l) {
12827
- words.push_back(new_str.substr(l, (r - l)));
12828
- }
12829
- return words;
12830
- }
12831
13193
 
12832
- bool is_ascii_punct(uint32_t code) {
12833
- if (code > 0xFF) {
12834
- return false;
13194
+ if (!words.back().size()) {
13195
+ words.pop_back();
12835
13196
  }
12836
- auto c = char(static_cast<unsigned char>(code));
12837
- return ispunct(c, std::locale::classic());
13197
+
13198
+ return words;
12838
13199
  }
12839
13200
 
12840
- bool is_chinese_char(uint32_t cpt) {
12841
- if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
12842
- (cpt >= 0x3400 && cpt <= 0x4DBF) ||
13201
+ static bool is_chinese_char(uint32_t cpt) {
13202
+ return
13203
+ (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
13204
+ (cpt >= 0x03400 && cpt <= 0x04DBF) ||
12843
13205
  (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
12844
13206
  (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
12845
13207
  (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
12846
13208
  (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
12847
- (cpt >= 0xF900 && cpt <= 0xFAFF) ||
12848
- (cpt >= 0x2F800 && cpt <= 0x2FA1F) ||
12849
- (cpt >= 0x3000 && cpt <= 0x303F) ||
12850
- (cpt >= 0xFF00 && cpt <= 0xFFEF)) {
12851
- return true; // NOLINT
12852
- }
12853
- return false;
13209
+ (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
13210
+ (cpt >= 0x2F800 && cpt <= 0x2FA1F);
13211
+ //(cpt >= 0x3000 && cpt <= 0x303F) ||
13212
+ //(cpt >= 0xFF00 && cpt <= 0xFFEF);
12854
13213
  }
12855
13214
 
12856
13215
  const llama_vocab & vocab;
@@ -12894,9 +13253,8 @@ struct fragment_buffer_variant {
12894
13253
 
12895
13254
  static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
12896
13255
  // for each special token
12897
- for (const auto & st: vocab.special_tokens_cache) {
12898
- const auto & special_token = st.first;
12899
- const auto & special_id = st.second;
13256
+ for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
13257
+ const auto & special_token = vocab.id_to_token[special_id].text;
12900
13258
 
12901
13259
  // for each text fragment
12902
13260
  std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -12905,7 +13263,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12905
13263
 
12906
13264
  // if a fragment is text ( not yet processed )
12907
13265
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12908
- auto * raw_text = &(fragment.raw_text);
13266
+ auto & raw_text = fragment.raw_text;
12909
13267
 
12910
13268
  auto raw_text_base_offset = fragment.offset;
12911
13269
  auto raw_text_base_length = fragment.length;
@@ -12915,7 +13273,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12915
13273
  // find the first occurrence of a given special token in this fragment
12916
13274
  // passing offset argument only limit the "search area" but match coordinates
12917
13275
  // are still relative to the source full raw_text
12918
- auto match = raw_text->find(special_token, raw_text_base_offset);
13276
+ auto match = raw_text.find(special_token, raw_text_base_offset);
12919
13277
 
12920
13278
  // no occurrences found, stop processing this fragment for a given special token
12921
13279
  if (match == std::string::npos) break;
@@ -12934,7 +13292,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12934
13292
  // left
12935
13293
  const int64_t left_reminder_offset = raw_text_base_offset + 0;
12936
13294
  const int64_t left_reminder_length = match - raw_text_base_offset;
12937
- buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
13295
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
12938
13296
 
12939
13297
  #ifdef PRETOKENIZERDEBUG
12940
13298
  LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@@ -12950,7 +13308,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
12950
13308
  if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
12951
13309
  const int64_t right_reminder_offset = match + special_token.length();
12952
13310
  const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
12953
- buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
13311
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
12954
13312
 
12955
13313
  #ifdef PRETOKENIZERDEBUG
12956
13314
  LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@@ -14054,7 +14412,7 @@ void llama_sample_repetition_penalties(
14054
14412
 
14055
14413
  void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
14056
14414
  GGML_ASSERT(ctx);
14057
- const int64_t t_start_sample_us = ggml_time_us();
14415
+ int64_t t_start_sample_us = ggml_time_us();
14058
14416
 
14059
14417
  bool allow_eog = false;
14060
14418
  for (const auto & stack : grammar->stacks) {
@@ -14066,12 +14424,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
14066
14424
 
14067
14425
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
14068
14426
  candidates_decoded.reserve(candidates->size);
14069
- std::vector<llama_grammar_candidate> candidates_grammar;
14427
+
14428
+ std::vector<llama_grammar_candidate> candidates_grammar;
14070
14429
  candidates_grammar.reserve(candidates->size);
14071
14430
 
14072
14431
  for (size_t i = 0; i < candidates->size; ++i) {
14073
- const llama_token id = candidates->data[i].id;
14074
- const std::string piece = llama_token_to_piece(ctx, id, false);
14432
+ const llama_token id = candidates->data[i].id;
14433
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
14075
14434
 
14076
14435
  if (llama_token_is_eog(&ctx->model, id)) {
14077
14436
  if (!allow_eog) {
@@ -14271,7 +14630,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
14271
14630
  GGML_ASSERT(false);
14272
14631
  }
14273
14632
 
14274
- const std::string piece = llama_token_to_piece(ctx, token, false);
14633
+ const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
14275
14634
 
14276
14635
  // Note terminating 0 in decoded string
14277
14636
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -16235,6 +16594,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
16235
16594
  case LLM_ARCH_COMMAND_R:
16236
16595
  case LLM_ARCH_OLMO:
16237
16596
  case LLM_ARCH_ARCTIC:
16597
+ case LLM_ARCH_DEEPSEEK2:
16238
16598
  return LLAMA_ROPE_TYPE_NORM;
16239
16599
 
16240
16600
  // the pairs of head values are offset by n_rot/2
@@ -17861,6 +18221,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17861
18221
  );
17862
18222
  }
17863
18223
 
18224
+ bool llama_token_is_control(const struct llama_model * model, llama_token token) {
18225
+ return llama_is_control_token(model->vocab, token);
18226
+ }
18227
+
17864
18228
  llama_token llama_token_bos(const struct llama_model * model) {
17865
18229
  return model->vocab.special_bos_id;
17866
18230
  }
@@ -17932,7 +18296,16 @@ static std::string llama_decode_text(const std::string & text) {
17932
18296
 
17933
18297
  const auto cpts = unicode_cpts_from_utf8(text);
17934
18298
  for (const auto cpt : cpts) {
17935
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
18299
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
18300
+ try {
18301
+ decoded_text += unicode_utf8_to_byte(utf8);
18302
+ } catch (const std::out_of_range & e) {
18303
+ decoded_text += "[UNK_BYTE_0x";
18304
+ for (const auto c : utf8) {
18305
+ decoded_text += format("%02x", (uint8_t) c);
18306
+ }
18307
+ decoded_text += text + "]";
18308
+ }
17936
18309
  }
17937
18310
 
17938
18311
  return decoded_text;
@@ -17940,69 +18313,83 @@ static std::string llama_decode_text(const std::string & text) {
17940
18313
 
17941
18314
  // does not write null-terminator to buf
17942
18315
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
18316
+ // if we have a cache - use it
18317
+ {
18318
+ const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
18319
+
18320
+ if (!cache.empty()) {
18321
+ const auto & res = cache.at(token);
18322
+ if (length < (int) res.size()) {
18323
+ return -(int) res.size();
18324
+ }
18325
+ memcpy(buf, res.c_str(), res.size());
18326
+ return res.size();
18327
+ }
18328
+ }
18329
+
17943
18330
  if (0 <= token && token < llama_n_vocab(model)) {
17944
18331
  switch (llama_vocab_get_type(model->vocab)) {
17945
- case LLAMA_VOCAB_TYPE_WPM:
17946
- case LLAMA_VOCAB_TYPE_SPM: {
17947
- // NOTE: we accept all unsupported token types,
17948
- // suppressing them like CONTROL tokens.
17949
- if (llama_is_normal_token(model->vocab, token)) {
17950
- std::string result = model->vocab.id_to_token[token].text;
17951
- llama_unescape_whitespace(result);
17952
- if (length < (int) result.length()) {
17953
- return -(int) result.length();
17954
- }
17955
- memcpy(buf, result.c_str(), result.length());
17956
- return result.length();
17957
- } else if (
17958
- (llama_is_user_defined_token(model->vocab, token)) ||
17959
- (llama_is_control_token (model->vocab, token) && special)) {
17960
- std::string result = model->vocab.id_to_token[token].text;
17961
- if (length < (int) result.length()) {
17962
- return -(int) result.length();
17963
- }
17964
- memcpy(buf, result.c_str(), result.length());
17965
- return result.length();
17966
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
17967
- if (length < 3) {
17968
- return -3;
17969
- }
17970
- memcpy(buf, "\xe2\x96\x85", 3);
17971
- return 3;
17972
- } else if (llama_is_byte_token(model->vocab, token)) {
17973
- if (length < 1) {
17974
- return -1;
18332
+ case LLAMA_VOCAB_TYPE_WPM:
18333
+ case LLAMA_VOCAB_TYPE_SPM: {
18334
+ // NOTE: we accept all unsupported token types,
18335
+ // suppressing them like CONTROL tokens.
18336
+ if (llama_is_normal_token(model->vocab, token)) {
18337
+ std::string result = model->vocab.id_to_token[token].text;
18338
+ llama_unescape_whitespace(result);
18339
+ if (length < (int) result.length()) {
18340
+ return -(int) result.length();
18341
+ }
18342
+ memcpy(buf, result.c_str(), result.length());
18343
+ return result.length();
18344
+ } else if (
18345
+ (llama_is_user_defined_token(model->vocab, token)) ||
18346
+ (llama_is_control_token (model->vocab, token) && special)) {
18347
+ std::string result = model->vocab.id_to_token[token].text;
18348
+ if (length < (int) result.length()) {
18349
+ return -(int) result.length();
18350
+ }
18351
+ memcpy(buf, result.c_str(), result.length());
18352
+ return result.length();
18353
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
18354
+ if (length < 3) {
18355
+ return -3;
18356
+ }
18357
+ memcpy(buf, "\xe2\x96\x85", 3);
18358
+ return 3;
18359
+ } else if (llama_is_byte_token(model->vocab, token)) {
18360
+ if (length < 1) {
18361
+ return -1;
18362
+ }
18363
+ buf[0] = llama_token_to_byte(model->vocab, token);
18364
+ return 1;
17975
18365
  }
17976
- buf[0] = llama_token_to_byte(model->vocab, token);
17977
- return 1;
18366
+ break;
17978
18367
  }
17979
- break;
17980
- }
17981
- case LLAMA_VOCAB_TYPE_BPE: {
17982
- // NOTE: we accept all unsupported token types,
17983
- // suppressing them like CONTROL tokens.
17984
- if (llama_is_normal_token(model->vocab, token)) {
17985
- std::string result = model->vocab.id_to_token[token].text;
17986
- result = llama_decode_text(result);
17987
- if (length < (int) result.length()) {
17988
- return -(int) result.length();
17989
- }
17990
- memcpy(buf, result.c_str(), result.length());
17991
- return result.length();
17992
- } else if (
17993
- (llama_is_user_defined_token(model->vocab, token)) ||
17994
- (llama_is_control_token (model->vocab, token) && special)) {
17995
- std::string result = model->vocab.id_to_token[token].text;
17996
- if (length < (int) result.length()) {
17997
- return -(int) result.length();
18368
+ case LLAMA_VOCAB_TYPE_BPE: {
18369
+ // NOTE: we accept all unsupported token types,
18370
+ // suppressing them like CONTROL tokens.
18371
+ if (llama_is_normal_token(model->vocab, token)) {
18372
+ std::string result = model->vocab.id_to_token[token].text;
18373
+ result = llama_decode_text(result);
18374
+ if (length < (int) result.length()) {
18375
+ return -(int) result.length();
18376
+ }
18377
+ memcpy(buf, result.c_str(), result.length());
18378
+ return result.length();
18379
+ } else if (
18380
+ (llama_is_user_defined_token(model->vocab, token)) ||
18381
+ (llama_is_control_token (model->vocab, token) && special)) {
18382
+ std::string result = model->vocab.id_to_token[token].text;
18383
+ if (length < (int) result.length()) {
18384
+ return -(int) result.length();
18385
+ }
18386
+ memcpy(buf, result.c_str(), result.length());
18387
+ return result.length();
17998
18388
  }
17999
- memcpy(buf, result.c_str(), result.length());
18000
- return result.length();
18389
+ break;
18001
18390
  }
18002
- break;
18003
- }
18004
- default:
18005
- GGML_ASSERT(false);
18391
+ default:
18392
+ GGML_ASSERT(false);
18006
18393
  }
18007
18394
  }
18008
18395
  return 0;
@@ -18337,6 +18724,7 @@ const char * llama_print_system_info(void) {
18337
18724
  s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
18338
18725
  s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
18339
18726
  s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
18727
+ s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
18340
18728
  s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
18341
18729
  s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
18342
18730
  s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";