llama_cpp 0.14.4 → 0.14.6

This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -105,7 +105,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 8
+ #define LLAMA_MAX_EXPERTS 60


  //
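
The expert cap above is raised from 8 to 60, presumably to accommodate the Qwen2MoE architecture added elsewhere in this diff (Qwen1.5-MoE-A2.7B routes across 60 experts); DBRX, with 16 experts, also fits under the new limit. A minimal illustrative guard under that assumption — the helper below is hypothetical and not part of llama.cpp:

#include <cstdint>
#include <stdexcept>

constexpr uint32_t LLAMA_MAX_EXPERTS = 60; // mirrors the new compile-time cap

// hypothetical check: reject models whose expert count exceeds the cap
void check_expert_count(uint32_t n_expert) {
    if (n_expert > LLAMA_MAX_EXPERTS) {
        throw std::runtime_error("n_expert exceeds LLAMA_MAX_EXPERTS");
    }
}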
@@ -209,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
  LLM_ARCH_MAMBA,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
  LLM_ARCH_UNKNOWN,
  };

@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
  { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MAMBA, "mamba" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -261,6 +267,7 @@ enum llm_kv {
  LLM_KV_GENERAL_ALIGNMENT,
  LLM_KV_GENERAL_NAME,
  LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
  LLM_KV_GENERAL_URL,
  LLM_KV_GENERAL_DESCRIPTION,
  LLM_KV_GENERAL_LICENSE,
@@ -317,11 +324,17 @@ enum llm_kv {
  LLM_KV_TOKENIZER_UNK_ID,
  LLM_KV_TOKENIZER_SEP_ID,
  LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_CLS_ID,
+ LLM_KV_TOKENIZER_MASK_ID,
  LLM_KV_TOKENIZER_ADD_BOS,
  LLM_KV_TOKENIZER_ADD_EOS,
  LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
  };

  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -330,6 +343,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
  { LLM_KV_GENERAL_NAME, "general.name" },
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
  { LLM_KV_GENERAL_URL, "general.url" },
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,11 +400,17 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
  };

  struct LLM_KV {
@@ -421,6 +441,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
  LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
@@ -432,6 +453,9 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -694,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -729,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -924,6 +972,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ },
+ },
+ {
+ LLM_ARCH_DBRX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_OLMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -1630,17 +1710,17 @@ static size_t llama_get_device_memory(int device) {
  #if defined(GGML_USE_CUDA)
  size_t total;
  size_t free;
- ggml_backend_cuda_get_device_memory(device, &total, &free);
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_SYCL)
  size_t total;
  size_t free;
- ggml_backend_sycl_get_device_memory(device, &total, &free);
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
  return free;
  #elif defined(GGML_USE_VULKAN)
  size_t total;
  size_t free;
- ggml_backend_vk_get_device_memory(device, &total, &free);
+ ggml_backend_vk_get_device_memory(device, &free, &total);
  return free;
  #else
  return 1;
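
In 0.14.4 the backend queries above passed the out-parameters as (device, &total, &free) while the function returns free; the reordering to (device, &free, &total) suggests the ggml backends fill the free amount first and the total second, so the old order handed back total memory where free memory was expected. A standalone sketch of that failure mode — get_device_memory below is a hypothetical stand-in for the ggml backend call:

#include <cstddef>
#include <cstdio>

// hypothetical mock of ggml_backend_*_get_device_memory(device, &free, &total)
static void get_device_memory(int /*device*/, size_t * free, size_t * total) {
    *total = 24ull * 1024 * 1024 * 1024; // pretend the device has 24 GiB
    *free  = 20ull * 1024 * 1024 * 1024; // of which 20 GiB is currently free
}

int main() {
    size_t total = 0;
    size_t free  = 0;
    get_device_memory(0, &free, &total);    // corrected order, as in 0.14.6
    std::printf("free=%zu total=%zu\n", free, total);
    // get_device_memory(0, &total, &free); // old order: "free" would silently report 24 GiB
    return 0;
}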
@@ -1682,6 +1762,7 @@ enum e_model {
  MODEL_4B,
  MODEL_7B,
  MODEL_8B,
+ MODEL_12B,
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
@@ -1697,6 +1778,10 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_A2_7B,
+ MODEL_8x7B,
+ MODEL_8x22B,
+ MODEL_16x12B,
  };

  static const size_t kiB = 1024;
@@ -1880,6 +1965,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down_exps;
  struct ggml_tensor * ffn_up_exps ;

+ // ff shared expert (shexp)
+ struct ggml_tensor * ffn_gate_inp_shexp;
+ struct ggml_tensor * ffn_gate_shexp;
+ struct ggml_tensor * ffn_down_shexp;
+ struct ggml_tensor * ffn_up_shexp;
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -2014,20 +2105,22 @@ struct llama_vocab {
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;

  // default LLaMA special tokens
- id special_bos_id = 1;
- id special_eos_id = 2;
- id special_unk_id = 0;
- id special_sep_id = -1;
- id special_pad_id = -1;
+ id special_bos_id = 1;
+ id special_eos_id = 2;
+ id special_unk_id = 0;
+ id special_sep_id = -1;
+ id special_pad_id = -1;
+ id special_cls_id = -1;
+ id special_mask_id = -1;

  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

  id linefeed_id = 13;
- id special_prefix_id = 32007;
- id special_middle_id = 32009;
- id special_suffix_id = 32008;
- id special_eot_id = 32010;
+ id special_prefix_id = -1;
+ id special_suffix_id = -1;
+ id special_middle_id = -1;
+ id special_eot_id = -1;

  bool add_space_prefix = true;

@@ -2175,7 +2268,7 @@ struct llama_context {

  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
  size_t output_size = 0; // capacity (of tokens positions) for the output buffers
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

  bool logits_all = false;

@@ -3533,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
@@ -3548,6 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
+ case MODEL_8x7B: return "8x7B";
+ case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
  default: return "?B";
  }
  }
@@ -3662,15 +3760,23 @@ static void llm_load_hparams(
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

- switch (hparams.n_layer) {
- case 22: model.type = e_model::MODEL_1B; break;
- case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- case 48: model.type = e_model::MODEL_34B; break;
- case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ if (hparams.n_expert == 8) {
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8x7B; break;
+ case 56: model.type = e_model::MODEL_8x22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } else {
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 26: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
+ case 60: model.type = e_model::MODEL_30B; break;
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
  }
  } break;
  case LLM_ARCH_MINICPM:
@@ -3812,6 +3918,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_12B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -3836,6 +3943,14 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_A2_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3961,6 +4076,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_16x12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -3974,7 +4111,9 @@ static void llm_load_hparams(
  }

  // TODO: This should probably be in llama.h
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
+ );
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

  static void llm_load_vocab(
@@ -3996,23 +4135,53 @@ static void llm_load_vocab(
3996
4135
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
3997
4136
 
3998
4137
  // default special tokens
3999
- vocab.special_bos_id = -1;
4000
- vocab.special_eos_id = -1;
4001
- vocab.special_unk_id = -1;
4002
- vocab.special_sep_id = -1;
4003
- vocab.special_pad_id = -1;
4004
- vocab.linefeed_id = -1;
4138
+ vocab.special_bos_id = -1;
4139
+ vocab.special_eos_id = -1;
4140
+ vocab.special_unk_id = -1;
4141
+ vocab.special_sep_id = -1;
4142
+ vocab.special_pad_id = -1;
4143
+ vocab.special_cls_id = -1;
4144
+ vocab.special_mask_id = -1;
4145
+ vocab.linefeed_id = -1;
4005
4146
 
4006
4147
  return;
4007
4148
  } else if (tokenizer_name == "llama") {
4008
4149
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4009
4150
 
4010
4151
  // default special tokens
4011
- vocab.special_bos_id = 1;
4012
- vocab.special_eos_id = 2;
4013
- vocab.special_unk_id = 0;
4014
- vocab.special_sep_id = -1;
4015
- vocab.special_pad_id = -1;
4152
+ vocab.special_bos_id = 1;
4153
+ vocab.special_eos_id = 2;
4154
+ vocab.special_unk_id = 0;
4155
+ vocab.special_sep_id = -1;
4156
+ vocab.special_pad_id = -1;
4157
+ vocab.special_cls_id = -1;
4158
+ vocab.special_mask_id = -1;
4159
+
4160
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
4161
+ // prior to support of FIM special tokens in GGUF, the following
4162
+ // will allow those models to continue to work. The general names
4163
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4164
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4165
+ // new versions of these models have been published.
4166
+ std::string gen_name;
4167
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4168
+
4169
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4170
+ [](unsigned char c){ return std::tolower(c); });
4171
+
4172
+ if (gen_name.find("code") != std::string::npos) {
4173
+ if (model.arch == LLM_ARCH_LLAMA) {
4174
+ vocab.special_prefix_id = 32007;
4175
+ vocab.special_suffix_id = 32008;
4176
+ vocab.special_middle_id = 32009;
4177
+ vocab.special_eot_id = 32010;
4178
+ } else if (model.arch == LLM_ARCH_GEMMA) {
4179
+ vocab.special_prefix_id = 67;
4180
+ vocab.special_suffix_id = 69;
4181
+ vocab.special_middle_id = 68;
4182
+ vocab.special_eot_id = 70;
4183
+ }
4184
+ }
4016
4185
 
4017
4186
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4018
4187
  if (add_space_prefix_keyidx != -1) {
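
The hunk above changes the Fill-In-the-Middle (FIM) token handling: the prefix/suffix/middle/EOT ids in llama_vocab now default to -1 instead of hard-coded CodeLlama values, and are only backfilled for SPM models whose general.name contains "code" (32007-32010 for LLM_ARCH_LLAMA, 67-70 for LLM_ARCH_GEMMA) or read from the new tokenizer.ggml.*_token_id keys. A compact sketch of just that fallback logic, with the architecture flags passed in as plain booleans for illustration:

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <string>

struct fim_ids { int32_t prefix = -1, suffix = -1, middle = -1, eot = -1; };

// mirrors the fallback shown above: ids stay -1 unless the model name contains
// "code" and the architecture is one of the known FIM-capable families
fim_ids guess_fim_ids(std::string gen_name, bool is_llama_arch, bool is_gemma_arch) {
    std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
                   [](unsigned char c) { return (char) std::tolower(c); });
    fim_ids ids;
    if (gen_name.find("code") != std::string::npos) {
        if (is_llama_arch)      { ids = {32007, 32008, 32009, 32010}; } // CodeLlama
        else if (is_gemma_arch) { ids = {   67,    69,    68,    70}; } // CodeGemma
    }
    return ids;
}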
@@ -4047,20 +4216,24 @@ static void llm_load_vocab(
4047
4216
  }
4048
4217
 
4049
4218
  // default special tokens
4050
- vocab.special_bos_id = 11;
4051
- vocab.special_eos_id = 11;
4052
- vocab.special_unk_id = -1;
4053
- vocab.special_sep_id = -1;
4054
- vocab.special_pad_id = -1;
4219
+ vocab.special_bos_id = 11;
4220
+ vocab.special_eos_id = 11;
4221
+ vocab.special_unk_id = -1;
4222
+ vocab.special_sep_id = -1;
4223
+ vocab.special_pad_id = -1;
4224
+ vocab.special_cls_id = -1;
4225
+ vocab.special_mask_id = -1;
4055
4226
  } else if (tokenizer_name == "bert") {
4056
4227
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
4057
4228
 
4058
4229
  // default special tokens
4059
- vocab.special_bos_id = 101;
4060
- vocab.special_eos_id = 102;
4061
- vocab.special_unk_id = 100;
4062
- vocab.special_sep_id = -1;
4063
- vocab.special_pad_id = -1;
4230
+ vocab.special_bos_id = -1;
4231
+ vocab.special_eos_id = -1;
4232
+ vocab.special_unk_id = 100;
4233
+ vocab.special_sep_id = 102;
4234
+ vocab.special_pad_id = 0;
4235
+ vocab.special_cls_id = 101;
4236
+ vocab.special_mask_id = 103;
4064
4237
  vocab.add_space_prefix = false;
4065
4238
  } else {
4066
4239
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4296,17 @@ static void llm_load_vocab(
4123
4296
  // special tokens
4124
4297
  {
4125
4298
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
4126
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4127
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4128
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4129
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4130
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4299
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4300
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4301
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4302
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4303
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4304
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4305
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4306
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
4307
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
4308
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
4309
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
4131
4310
  };
4132
4311
  for (const auto & it : special_token_types) {
4133
4312
  const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4498,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4319
4498
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
4320
4499
 
4321
4500
  // special tokens
4322
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4323
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4324
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4325
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4326
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4327
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4501
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4502
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4503
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4504
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4505
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4506
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4507
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4508
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4328
4509
  }
4329
4510
 
4330
4511
  // Returns false if cancelled by progress_callback
@@ -4342,6 +4523,13 @@ static bool llm_load_tensors(

  auto & hparams = model.hparams;

+ #ifdef GGML_USE_SYCL
+ // disable MoE with SYCL until mul_mat_id is updated
+ if (hparams.n_expert > 0) {
+ n_gpu_layers = 0;
+ }
+ #endif
+
  model.split_mode = split_mode;
  model.main_gpu = main_gpu;
  model.n_gpu_layers = n_gpu_layers;
@@ -4439,7 +4627,7 @@ static bool llm_load_tensors(
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

  // for moe merged tensors
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+ ctx_size += ggml_tensor_overhead()*n_layer*3;

  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  for (auto & it : buft_layer_count) {
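
The context-size reservation for merged MoE tensors above changes from one tensor slot per expert per layer to a flat three slots per layer, matching the merged 3D ffn_gate_exps/ffn_down_exps/ffn_up_exps tensors. A quick comparison of the reserved overhead under assumed values (the 368-byte overhead and the layer/expert counts below are illustrative, not taken from ggml or any model):

#include <cstddef>
#include <cstdio>

int main() {
    const size_t overhead = 368;  // assumed approximation of ggml_tensor_overhead()
    const int    n_layer  = 32;
    const int    n_expert = 8;
    std::printf("0.14.4 reservation: %zu bytes\n", overhead * n_expert * n_layer); // one slot per expert per layer
    std::printf("0.14.6 reservation: %zu bytes\n", overhead * n_layer * 3);        // 3 merged tensors per layer
    return 0;
}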
@@ -4635,6 +4823,39 @@ static bool llm_load_tensors(
4635
4823
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4636
4824
  }
4637
4825
  } break;
4826
+ case LLM_ARCH_DBRX:
4827
+ {
4828
+ if (n_expert == 0) {
4829
+ throw std::runtime_error("DBRX model cannot have zero experts");
4830
+ }
4831
+
4832
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4833
+
4834
+ // output
4835
+ {
4836
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4837
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4838
+ }
4839
+
4840
+ for (int i = 0; i < n_layer; ++i) {
4841
+ ggml_context * ctx_layer = ctx_for_layer(i);
4842
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4843
+
4844
+ auto & layer = model.layers[i];
4845
+
4846
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4847
+
4848
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4849
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4850
+
4851
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4852
+
4853
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4854
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4855
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
4856
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4857
+ }
4858
+ } break;
4638
4859
  case LLM_ARCH_BAICHUAN:
4639
4860
  {
4640
4861
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4949,8 +5170,13 @@ static bool llm_load_tensors(
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);

- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ // optional q and k layernorms, present in StableLM 2 12B
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -4993,7 +5219,13 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5021,6 +5253,54 @@ static bool llm_load_tensors(
5021
5253
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5022
5254
  }
5023
5255
  } break;
5256
+ case LLM_ARCH_QWEN2MOE:
5257
+ {
5258
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5259
+
5260
+ // output
5261
+ {
5262
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5263
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5264
+ }
5265
+
5266
+ for (int i = 0; i < n_layer; ++i) {
5267
+ ggml_context * ctx_layer = ctx_for_layer(i);
5268
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5269
+
5270
+ auto & layer = model.layers[i];
5271
+
5272
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5273
+
5274
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5275
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5276
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5277
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5278
+
5279
+ // optional bias tensors
5280
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5281
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5282
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5283
+
5284
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5285
+
5286
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5287
+
5288
+ GGML_ASSERT(hparams.n_expert > 0);
5289
+ GGML_ASSERT(hparams.n_expert_used > 0);
5290
+
5291
+ // MoE branch
5292
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
5293
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5294
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5295
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5296
+
5297
+ // Shared expert branch
5298
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5299
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5300
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5301
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
5302
+ }
5303
+ } break;
5024
5304
  case LLM_ARCH_PHI2:
5025
5305
  {
5026
5306
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
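
In the Qwen2MoE tensor loader above, each routed expert's FFN width is derived as n_ff_exp = n_ff / hparams.n_expert_used, while the shared expert (the *_shexp tensors) keeps the full n_ff. A small worked example with illustrative values (the numbers below are not taken from any specific GGUF file):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_ff          = 5632; // hypothetical hparams.n_ff
    const int64_t n_expert_used = 4;    // hypothetical hparams.n_expert_used
    const int64_t n_ff_exp      = n_ff / n_expert_used; // width of each routed expert's FFN
    std::printf("n_ff_exp = %lld (shared expert keeps %lld)\n",
                (long long) n_ff_exp, (long long) n_ff);
    return 0;
}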
@@ -5404,11 +5684,47 @@ static bool llm_load_tensors(
5404
5684
 
5405
5685
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5406
5686
 
5687
+ if (n_layer >= 64){
5688
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
5689
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
5690
+ }
5691
+
5692
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5693
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5694
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5695
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5696
+
5697
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5698
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5699
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5700
+ }
5701
+ } break;
5702
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
5703
+ {
5704
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5705
+
5706
+ // output
5707
+ {
5708
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5709
+ // if output is NULL, init from the input tok embed
5710
+ if (model.output == NULL) {
5711
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5712
+ ml.n_created--; // artificial tensor
5713
+ ml.size_data += ggml_nbytes(model.output);
5714
+ }
5715
+ }
5716
+
5717
+ for (int i = 0; i < n_layer; ++i) {
5718
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5719
+
5720
+ auto & layer = model.layers[i];
5721
+
5407
5722
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5408
5723
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5409
5724
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5410
5725
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5411
5726
 
5727
+
5412
5728
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5413
5729
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5414
5730
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5849,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
5849
6165
  return cur;
5850
6166
  }
5851
6167
 
6168
+ static struct ggml_tensor * llm_build_moe_ffn(
6169
+ struct ggml_context * ctx,
6170
+ struct ggml_tensor * cur,
6171
+ struct ggml_tensor * gate_inp,
6172
+ struct ggml_tensor * up_exps,
6173
+ struct ggml_tensor * gate_exps,
6174
+ struct ggml_tensor * down_exps,
6175
+ int64_t n_expert,
6176
+ int64_t n_expert_used,
6177
+ llm_ffn_op_type type_op,
6178
+ bool norm_w,
6179
+ const llm_build_cb & cb,
6180
+ int il) {
6181
+ int64_t n_embd = cur->ne[0];
6182
+ int64_t n_tokens = cur->ne[1];
6183
+
6184
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
6185
+ cb(logits, "ffn_moe_logits", il);
6186
+
6187
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
6188
+ cb(probs, "ffn_moe_probs", il);
6189
+
6190
+ // select experts
6191
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
6192
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6193
+ cb(selected_experts, "ffn_moe_topk", il);
6194
+
6195
+ ggml_tensor * weights = ggml_get_rows(ctx,
6196
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
6197
+ cb(weights, "ffn_moe_weights", il);
6198
+
6199
+ if (norm_w) {
6200
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
6201
+
6202
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
6203
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6204
+
6205
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
6206
+ cb(weights, "ffn_moe_weights_norm", il);
6207
+
6208
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6209
+ }
6210
+
6211
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6212
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6213
+ cb(up, "ffn_moe_up", il);
6214
+
6215
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6216
+ cb(gate, "ffn_moe_gate", il);
6217
+
6218
+ switch (type_op) {
6219
+ case LLM_FFN_SILU:
6220
+ {
6221
+ gate = ggml_silu(ctx, gate);
6222
+ cb(gate, "ffn_moe_silu", il);
6223
+ } break;
6224
+ case LLM_FFN_GELU:
6225
+ {
6226
+ gate = ggml_gelu(ctx, gate);
6227
+ cb(gate, "ffn_moe_gelu", il);
6228
+ } break;
6229
+ default:
6230
+ GGML_ASSERT(false);
6231
+ }
6232
+
6233
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
6234
+ cb(par, "ffn_moe_gate_par", il);
6235
+
6236
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
6237
+ cb(experts, "ffn_moe_down", il);
6238
+
6239
+ experts = ggml_mul(ctx, experts, weights);
6240
+
6241
+ // aggregate experts
6242
+ ggml_tensor * moe_out = nullptr;
6243
+ for (int i = 0; i < n_expert_used; ++i) {
6244
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
6245
+ experts->nb[2], i*experts->nb[1]);
6246
+
6247
+ if (i == 0) {
6248
+ moe_out = cur_expert;
6249
+ } else {
6250
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
6251
+ }
6252
+ }
6253
+
6254
+ if (n_expert_used == 1) {
6255
+ // avoid returning a non-contiguous tensor
6256
+ moe_out = ggml_cont(ctx, moe_out);
6257
+ }
6258
+
6259
+ return moe_out;
6260
+ }
6261
+
5852
6262
  // if max_alibi_bias > 0 then apply ALiBi
5853
6263
  static struct ggml_tensor * llm_build_kqv(
5854
6264
  struct ggml_context * ctx,
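
The llm_build_moe_ffn helper added above centralizes the expert routing that the following hunks remove from the open-coded Mixtral and Grok graphs, and it is also what the new DBRX and Qwen2MoE graphs call: softmax over the gate logits, top-k selection of n_expert_used experts, optional renormalization of the selected weights, a gated (SiLU or GELU) FFN per selected expert via ggml_mul_mat_id, and a weighted sum of the expert outputs. A CPU-only sketch of just the routing math for one token, written in plain C++ so it stands alone from ggml:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // toy gate logits for one token over n_expert = 4 experts
    const std::vector<float> logits = {0.1f, 2.0f, -1.0f, 1.5f};
    const int  n_expert_used = 2;
    const bool norm_w        = true; // Mixtral/Grok/DBRX renormalize; Qwen2MoE passes false

    // softmax -> routing probabilities (ffn_moe_probs)
    const float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
    for (float & p : probs) { p /= sum; }

    // top-k expert indices (ffn_moe_topk)
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // gather and optionally renormalize the selected weights (ffn_moe_weights[_norm])
    std::vector<float> weights(n_expert_used);
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) { weights[i] = probs[idx[i]]; wsum += weights[i]; }
    if (norm_w) { for (float & w : weights) { w /= wsum; } }

    for (int i = 0; i < n_expert_used; ++i) {
        std::printf("expert %d, weight %.3f\n", idx[i], weights[i]);
    }
    return 0;
}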
@@ -6392,62 +6802,15 @@ struct llm_build_context {
6392
6802
  LLM_NORM_RMS, cb, il);
6393
6803
  cb(cur, "ffn_norm", il);
6394
6804
 
6395
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6396
- cb(logits, "ffn_moe_logits", il);
6397
-
6398
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6399
- cb(probs, "ffn_moe_probs", il);
6400
-
6401
- // select experts
6402
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6403
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6404
-
6405
- ggml_tensor * weights = ggml_get_rows(ctx0,
6406
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6407
- cb(weights, "ffn_moe_weights", il);
6408
-
6409
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6410
-
6411
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6412
- cb(weights_sum, "ffn_moe_weights_sum", il);
6413
-
6414
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6415
- cb(weights, "ffn_moe_weights_norm", il);
6416
-
6417
- // compute expert outputs
6418
- ggml_tensor * moe_out = nullptr;
6419
-
6420
- for (int i = 0; i < n_expert_used; ++i) {
6421
- ggml_tensor * cur_expert;
6422
-
6423
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6424
- cb(cur_up, "ffn_moe_up", il);
6425
-
6426
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6427
- cb(cur_gate, "ffn_moe_gate", il);
6428
-
6429
- cur_gate = ggml_silu(ctx0, cur_gate);
6430
- cb(cur_gate, "ffn_moe_silu", il);
6431
-
6432
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6433
- cb(cur_expert, "ffn_moe_gate_par", il);
6434
-
6435
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6436
- cb(cur_expert, "ffn_moe_down", il);
6437
-
6438
- cur_expert = ggml_mul(ctx0, cur_expert,
6439
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6440
- cb(cur_expert, "ffn_moe_weighted", il);
6441
-
6442
- if (i == 0) {
6443
- moe_out = cur_expert;
6444
- } else {
6445
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6446
- cb(moe_out, "ffn_moe_out", il);
6447
- }
6448
- }
6449
-
6450
- cur = moe_out;
6805
+ cur = llm_build_moe_ffn(ctx0, cur,
6806
+ model.layers[il].ffn_gate_inp,
6807
+ model.layers[il].ffn_up_exps,
6808
+ model.layers[il].ffn_gate_exps,
6809
+ model.layers[il].ffn_down_exps,
6810
+ n_expert, n_expert_used,
6811
+ LLM_FFN_SILU, true,
6812
+ cb, il);
6813
+ cb(cur, "ffn_moe_out", il);
6451
6814
  }
6452
6815
 
6453
6816
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6926,63 +7289,15 @@ struct llm_build_context {
6926
7289
  LLM_NORM_RMS, cb, il);
6927
7290
  cb(cur, "ffn_norm", il);
6928
7291
 
6929
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
- cb(logits, "ffn_moe_logits", il);
6931
-
6932
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
- cb(probs, "ffn_moe_probs", il);
6934
-
6935
- // select experts
6936
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
-
6939
- ggml_tensor * weights = ggml_get_rows(ctx0,
6940
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
- cb(weights, "ffn_moe_weights", il);
6942
-
6943
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
-
6945
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
- cb(weights_sum, "ffn_moe_weights_sum", il);
6947
-
6948
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
- cb(weights, "ffn_moe_weights_norm", il);
6950
-
6951
- // compute expert outputs
6952
- ggml_tensor * moe_out = nullptr;
6953
-
6954
- for (int i = 0; i < n_expert_used; ++i) {
6955
- ggml_tensor * cur_expert;
6956
-
6957
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
- cb(cur_up, "ffn_moe_up", il);
6959
-
6960
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
- cb(cur_gate, "ffn_moe_gate", il);
6962
-
6963
- //GeLU
6964
- cur_gate = ggml_gelu(ctx0, cur_gate);
6965
- cb(cur_gate, "ffn_moe_gelu", il);
6966
-
6967
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
- cb(cur_expert, "ffn_moe_gate_par", il);
6969
-
6970
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
- cb(cur_expert, "ffn_moe_down", il);
6972
-
6973
- cur_expert = ggml_mul(ctx0, cur_expert,
6974
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
- cb(cur_expert, "ffn_moe_weighted", il);
6976
-
6977
- if (i == 0) {
6978
- moe_out = cur_expert;
6979
- } else {
6980
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
- cb(moe_out, "ffn_moe_out", il);
6982
- }
6983
- }
6984
-
6985
- cur = moe_out;
7292
+ cur = llm_build_moe_ffn(ctx0, cur,
7293
+ model.layers[il].ffn_gate_inp,
7294
+ model.layers[il].ffn_up_exps,
7295
+ model.layers[il].ffn_gate_exps,
7296
+ model.layers[il].ffn_down_exps,
7297
+ n_expert, n_expert_used,
7298
+ LLM_FFN_GELU, true,
7299
+ cb, il);
7300
+ cb(cur, "ffn_moe_out", il);
6986
7301
 
6987
7302
  // Grok
6988
7303
  // if layer_out_norm is present then apply it before adding the input
@@ -6994,7 +7309,6 @@ struct llm_build_context {
  cb(cur, "layer_out_norm", il);
  }

-
  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);

@@ -7030,12 +7344,16 @@ struct llm_build_context {
7030
7344
  return gf;
7031
7345
  }
7032
7346
 
7033
- struct ggml_cgraph * build_starcoder() {
7347
+ struct ggml_cgraph * build_dbrx() {
7034
7348
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7035
7349
 
7350
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
7351
+ int32_t n_tokens = this->n_tokens;
7352
+
7036
7353
  const int64_t n_embd_head = hparams.n_embd_head_v;
7037
7354
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7038
7355
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7356
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7039
7357
 
7040
7358
  struct ggml_tensor * cur;
7041
7359
  struct ggml_tensor * inpL;
@@ -7048,16 +7366,140 @@ struct llm_build_context {
7048
7366
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7049
7367
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7050
7368
 
7051
- struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7052
- cb(pos, "pos_embd", -1);
7053
-
7054
- inpL = ggml_add(ctx0, inpL, pos);
7055
- cb(inpL, "inpL", -1);
7056
-
7057
7369
  for (int il = 0; il < n_layer; ++il) {
7370
+ struct ggml_tensor * inpSA = inpL;
7371
+
7372
+ // norm
7058
7373
  cur = llm_build_norm(ctx0, inpL, hparams,
7059
- model.layers[il].attn_norm,
7060
- model.layers[il].attn_norm_b,
7374
+ model.layers[il].attn_norm, NULL,
7375
+ LLM_NORM, cb, il);
7376
+ cb(cur, "attn_norm", il);
7377
+
7378
+ // self-attention
7379
+ {
7380
+ struct ggml_tensor * Qcur = nullptr;
7381
+ struct ggml_tensor * Kcur = nullptr;
7382
+ struct ggml_tensor * Vcur = nullptr;
7383
+
7384
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7385
+ cb(cur, "wqkv", il);
7386
+
7387
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
7388
+ cb(cur, "wqkv_clamped", il);
7389
+
7390
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7391
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7392
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7393
+
7394
+ cb(Qcur, "Qcur", il);
7395
+ cb(Kcur, "Kcur", il);
7396
+ cb(Vcur, "Vcur", il);
7397
+
7398
+ Qcur = ggml_rope_custom(
7399
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7401
+ ext_factor, attn_factor, beta_fast, beta_slow
7402
+ );
7403
+ cb(Qcur, "Qcur", il);
7404
+
7405
+ Kcur = ggml_rope_custom(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7407
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7408
+ ext_factor, attn_factor, beta_fast, beta_slow
7409
+ );
7410
+ cb(Kcur, "Kcur", il);
7411
+
7412
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7413
+ model.layers[il].wo, NULL,
7414
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7415
+ }
7416
+
7417
+ if (il == n_layer - 1) {
7418
+ // skip computing output for unused tokens
7419
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7420
+ n_tokens = n_outputs;
7421
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7422
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7423
+ }
7424
+
7425
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7426
+ cb(ffn_inp, "ffn_inp", il);
7427
+
7428
+ // feed-forward network
7429
+ // MoE branch
7430
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7431
+ model.layers[il].attn_out_norm, NULL,
7432
+ LLM_NORM, cb, il);
7433
+ cb(cur, "attn_out_norm", il);
7434
+
7435
+ cur = llm_build_moe_ffn(ctx0, cur,
7436
+ model.layers[il].ffn_gate_inp,
7437
+ model.layers[il].ffn_up_exps,
7438
+ model.layers[il].ffn_gate_exps,
7439
+ model.layers[il].ffn_down_exps,
7440
+ n_expert, n_expert_used,
7441
+ LLM_FFN_SILU, true,
7442
+ cb, il);
7443
+ cb(cur, "ffn_moe_out", il);
7444
+
7445
+ cur = ggml_add(ctx0, cur, ffn_inp);
7446
+ cb(cur, "ffn_out", il);
7447
+
7448
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7449
+ if (layer_dir != nullptr) {
7450
+ cur = ggml_add(ctx0, cur, layer_dir);
7451
+ }
7452
+ cb(cur, "l_out", il);
7453
+
7454
+ // input for next layer
7455
+ inpL = cur;
7456
+ }
7457
+
7458
+ cur = inpL;
7459
+
7460
+ cur = llm_build_norm(ctx0, cur, hparams,
7461
+ model.output_norm, NULL,
7462
+ LLM_NORM, cb, -1);
7463
+ cb(cur, "result_norm", -1);
7464
+
7465
+ // lm_head
7466
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7467
+
7468
+ cb(cur, "result_output", -1);
7469
+
7470
+ ggml_build_forward_expand(gf, cur);
7471
+
7472
+ return gf;
7473
+ }
7474
+
7475
+ struct ggml_cgraph * build_starcoder() {
7476
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7477
+
7478
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7479
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7480
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7481
+
7482
+ struct ggml_tensor * cur;
7483
+ struct ggml_tensor * inpL;
7484
+
7485
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7486
+
7487
+ // inp_pos - contains the positions
7488
+ struct ggml_tensor * inp_pos = build_inp_pos();
7489
+
7490
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7491
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7492
+
7493
+ struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7494
+ cb(pos, "pos_embd", -1);
7495
+
7496
+ inpL = ggml_add(ctx0, inpL, pos);
7497
+ cb(inpL, "inpL", -1);
7498
+
7499
+ for (int il = 0; il < n_layer; ++il) {
7500
+ cur = llm_build_norm(ctx0, inpL, hparams,
7501
+ model.layers[il].attn_norm,
7502
+ model.layers[il].attn_norm_b,
7061
7503
  LLM_NORM, cb, il);
7062
7504
  cb(cur, "attn_norm", il);
7063
7505
 
@@ -7882,7 +8324,7 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+

  // norm
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7891,6 +8333,8 @@ struct llm_build_context {
  LLM_NORM, cb, il);
  cb(cur, "attn_norm", il);

+ struct ggml_tensor * inpSA = cur;
+
  // self-attention
  {
  // compute Q and K and RoPE them
@@ -7915,15 +8359,36 @@ struct llm_build_context {
7915
8359
  cb(Vcur, "Vcur", il);
7916
8360
  }
7917
8361
 
8362
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8363
+ cb(Qcur, "Qcur", il);
8364
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8365
+ cb(Kcur, "Kcur", il);
8366
+
8367
+ if (model.layers[il].attn_q_norm) {
8368
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8369
+ model.layers[il].attn_q_norm,
8370
+ NULL,
8371
+ LLM_NORM, cb, il);
8372
+ cb(Qcur, "Qcur", il);
8373
+ }
8374
+ if (model.layers[il].attn_k_norm) {
8375
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8376
+ model.layers[il].attn_k_norm,
8377
+ NULL,
8378
+ LLM_NORM, cb, il);
8379
+ cb(Kcur, "Kcur", il);
8380
+ }
8381
+
8382
+
7918
8383
  Qcur = ggml_rope_custom(
7919
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8384
+ ctx0, Qcur, inp_pos,
7920
8385
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7921
8386
  ext_factor, attn_factor, beta_fast, beta_slow
7922
8387
  );
7923
8388
  cb(Qcur, "Qcur", il);
7924
8389
 
7925
8390
  Kcur = ggml_rope_custom(
7926
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8391
+ ctx0, Kcur, inp_pos,
7927
8392
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7928
8393
  ext_factor, attn_factor, beta_fast, beta_slow
7929
8394
  );
@@ -7938,20 +8403,25 @@ struct llm_build_context {
7938
8403
  // skip computing output for unused tokens
7939
8404
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
8405
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8406
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7941
8407
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
8408
  }
7943
8409
 
7944
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8410
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7945
8411
  cb(ffn_inp, "ffn_inp", il);
7946
8412
 
7947
8413
  // feed-forward network
7948
8414
  {
7949
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
7950
- model.layers[il].ffn_norm,
7951
- model.layers[il].ffn_norm_b,
7952
- LLM_NORM, cb, il);
7953
- cb(cur, "ffn_norm", il);
7954
-
8415
+ if (model.layers[il].ffn_norm) {
8416
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8417
+ model.layers[il].ffn_norm,
8418
+ model.layers[il].ffn_norm_b,
8419
+ LLM_NORM, cb, il);
8420
+ cb(cur, "ffn_norm", il);
8421
+ } else {
8422
+ // parallel residual
8423
+ cur = inpSA;
8424
+ }
7955
8425
  cur = llm_build_ffn(ctx0, cur,
7956
8426
  model.layers[il].ffn_up, NULL,
7957
8427
  model.layers[il].ffn_gate, NULL,
@@ -8141,12 +8611,6 @@ struct llm_build_context {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);

- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
-
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8213,6 +8677,150 @@ struct llm_build_context {
8213
8677
  return gf;
8214
8678
  }
8215
8679
 
8680
+ struct ggml_cgraph * build_qwen2moe() {
8681
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8682
+
8683
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
8684
+ int32_t n_tokens = this->n_tokens;
8685
+
8686
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8687
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8688
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8689
+
8690
+ struct ggml_tensor * cur;
8691
+ struct ggml_tensor * inpL;
8692
+
8693
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8694
+
8695
+ // inp_pos - contains the positions
8696
+ struct ggml_tensor * inp_pos = build_inp_pos();
8697
+
8698
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8699
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8700
+
8701
+ for (int il = 0; il < n_layer; ++il) {
8702
+ struct ggml_tensor * inpSA = inpL;
8703
+
8704
+ // norm
8705
+ cur = llm_build_norm(ctx0, inpL, hparams,
8706
+ model.layers[il].attn_norm, NULL,
8707
+ LLM_NORM_RMS, cb, il);
8708
+ cb(cur, "attn_norm", il);
8709
+
8710
+ // self_attention
8711
+ {
8712
+ // compute Q and K and RoPE them
8713
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8714
+ cb(Qcur, "Qcur", il);
8715
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8716
+ cb(Qcur, "Qcur", il);
8717
+
8718
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8719
+ cb(Kcur, "Kcur", il);
8720
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8721
+ cb(Kcur, "Kcur", il);
8722
+
8723
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8724
+ cb(Vcur, "Vcur", il);
8725
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8726
+ cb(Vcur, "Vcur", il);
8727
+
8728
+ Qcur = ggml_rope_custom(
8729
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8730
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8731
+ ext_factor, attn_factor, beta_fast, beta_slow
8732
+ );
8733
+ cb(Qcur, "Qcur", il);
8734
+
8735
+ Kcur = ggml_rope_custom(
8736
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8737
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8738
+ ext_factor, attn_factor, beta_fast, beta_slow
8739
+ );
8740
+ cb(Kcur, "Kcur", il);
8741
+
8742
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8743
+ model.layers[il].wo, model.layers[il].bo,
8744
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8745
+ }
8746
+
8747
+ if (il == n_layer - 1) {
8748
+ // skip computing output for unused tokens
8749
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8750
+ n_tokens = n_outputs;
8751
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8752
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8753
+ }
8754
+
8755
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8756
+ cb(ffn_inp, "ffn_inp", il);
8757
+
8758
+ // MoE branch
8759
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8760
+ model.layers[il].ffn_norm, NULL,
8761
+ LLM_NORM_RMS, cb, il);
8762
+ cb(cur, "ffn_norm", il);
8763
+
8764
+ ggml_tensor * moe_out =
8765
+ llm_build_moe_ffn(ctx0, cur,
8766
+ model.layers[il].ffn_gate_inp,
8767
+ model.layers[il].ffn_up_exps,
8768
+ model.layers[il].ffn_gate_exps,
8769
+ model.layers[il].ffn_down_exps,
8770
+ n_expert, n_expert_used,
8771
+ LLM_FFN_SILU, false,
8772
+ cb, il);
8773
+ cb(cur, "ffn_moe_out", il);
8774
+
8775
+ // FFN shared expert
8776
+ {
8777
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
8778
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
8779
+
8780
+ // sigmoid
8781
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
8782
+ cb(cur_gate, "ffn_shexp_gate", il);
8783
+
8784
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
8785
+ model.layers[il].ffn_up_shexp, NULL,
8786
+ model.layers[il].ffn_gate_shexp, NULL,
8787
+ model.layers[il].ffn_down_shexp, NULL,
8788
+ NULL,
8789
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
8790
+ cb(cur_ffn, "ffn_shexp", il);
8791
+
8792
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
8793
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
8794
+
8795
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
8796
+ cb(moe_out, "ffn_out", il);
8797
+
8798
+ cur = moe_out;
8799
+ }
8800
+
8801
+ cur = ggml_add(ctx0, cur, ffn_inp);
8802
+ cb(cur, "l_out", il);
8803
+
8804
+ // input for next layer
8805
+ inpL = cur;
8806
+ }
8807
+
8808
+ cur = inpL;
8809
+
8810
+ cur = llm_build_norm(ctx0, cur, hparams,
8811
+ model.output_norm, NULL,
8812
+ LLM_NORM_RMS, cb, -1);
8813
+ cb(cur, "result_norm", -1);
8814
+
8815
+ // lm_head
8816
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8817
+ cb(cur, "result_output", -1);
8818
+
8819
+ ggml_build_forward_expand(gf, cur);
8820
+
8821
+ return gf;
8822
+ }
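The shared-expert gate in build_qwen2moe() above computes a sigmoid indirectly: since silu(x) = x * sigmoid(x), dividing ggml_silu(x) by x recovers sigmoid(x) without needing a dedicated sigmoid op in the graph; the result then scales the shared-expert FFN output before it is added to the routed moe_out. A minimal standalone sketch of that identity (plain C++, no ggml types, illustrative values only):

// Sketch: sigmoid recovered from SiLU, mirroring ggml_div(ggml_silu(x), x) above.
// Standalone illustration only -- not part of llama.cpp. Note the identity is
// undefined at x == 0; the test values below avoid that.
#include <cmath>
#include <cstdio>

static float silu(float x)    { return x / (1.0f + std::exp(-x)); } // x * sigmoid(x)
static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
    const float xs[] = { -4.0f, -0.5f, 0.25f, 3.0f };
    for (float x : xs) {
        const float via_silu = silu(x) / x; // the trick used by the shared-expert gate
        std::printf("x=%6.2f  silu(x)/x=%.6f  sigmoid(x)=%.6f\n", x, via_silu, sigmoid(x));
    }
    return 0;
}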
8823
+
8216
8824
  struct ggml_cgraph * build_phi2() {
8217
8825
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8218
8826
 
@@ -9452,6 +10060,31 @@ struct llm_build_context {
9452
10060
  cb(Vcur, "Vcur", il);
9453
10061
  }
9454
10062
 
10063
+ if (model.layers[il].attn_q_norm) {
10064
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
10065
+ ggml_element_size(Qcur) * n_embd_head,
10066
+ ggml_element_size(Qcur) * n_embd_head * n_head,
10067
+ 0);
10068
+ cb(Qcur, "Qcur", il);
10069
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
10070
+ ggml_element_size(Kcur) * n_embd_head,
10071
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
10072
+ 0);
10073
+ cb(Kcur, "Kcur", il);
10074
+
10075
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
10076
+ model.layers[il].attn_q_norm,
10077
+ NULL,
10078
+ LLM_NORM, cb, il);
10079
+ cb(Qcur, "Qcur", il);
10080
+
10081
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
10082
+ model.layers[il].attn_k_norm,
10083
+ NULL,
10084
+ LLM_NORM, cb, il);
10085
+ cb(Kcur, "Kcur", il);
10086
+ }
10087
+
9455
10088
  Qcur = ggml_rope_custom(
9456
10089
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9457
10090
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -9522,6 +10155,139 @@ struct llm_build_context {
9522
10155
  return gf;
9523
10156
 
9524
10157
  }
10158
+
10159
+ // ref: https://allenai.org/olmo
10160
+ // based on the original build_llama() function, changes:
10161
+ // * non-parametric layer norm
10162
+ // * clamp qkv
10163
+ // * removed bias
10164
+ // * removed MoE
10165
+ struct ggml_cgraph * build_olmo() {
10166
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10167
+
10168
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10169
+ int32_t n_tokens = this->n_tokens;
10170
+
10171
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10172
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10173
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10174
+
10175
+ struct ggml_tensor * cur;
10176
+ struct ggml_tensor * inpL;
10177
+
10178
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10179
+
10180
+ // inp_pos - contains the positions
10181
+ struct ggml_tensor * inp_pos = build_inp_pos();
10182
+
10183
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10184
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10185
+
10186
+ for (int il = 0; il < n_layer; ++il) {
10187
+ struct ggml_tensor * inpSA = inpL;
10188
+
10189
+ // norm
10190
+ cur = llm_build_norm(ctx0, inpL, hparams,
10191
+ NULL, NULL,
10192
+ LLM_NORM, cb, il);
10193
+ cb(cur, "attn_norm", il);
10194
+
10195
+ // self-attention
10196
+ {
10197
+ // compute Q and K and RoPE them
10198
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10199
+ cb(Qcur, "Qcur", il);
10200
+ if (hparams.f_clamp_kqv > 0.0f) {
10201
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10202
+ cb(Qcur, "Qcur", il);
10203
+ }
10204
+
10205
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10206
+ cb(Kcur, "Kcur", il);
10207
+ if (hparams.f_clamp_kqv > 0.0f) {
10208
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10209
+ cb(Kcur, "Kcur", il);
10210
+ }
10211
+
10212
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10213
+ cb(Vcur, "Vcur", il);
10214
+ if (hparams.f_clamp_kqv > 0.0f) {
10215
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10216
+ cb(Vcur, "Vcur", il);
10217
+ }
10218
+
10219
+ Qcur = ggml_rope_custom(
10220
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10221
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10222
+ ext_factor, attn_factor, beta_fast, beta_slow
10223
+ );
10224
+ cb(Qcur, "Qcur", il);
10225
+
10226
+ Kcur = ggml_rope_custom(
10227
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10228
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10229
+ ext_factor, attn_factor, beta_fast, beta_slow
10230
+ );
10231
+ cb(Kcur, "Kcur", il);
10232
+
10233
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10234
+ model.layers[il].wo, nullptr,
10235
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10236
+ }
10237
+
10238
+ if (il == n_layer - 1) {
10239
+ // skip computing output for unused tokens
10240
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10241
+ n_tokens = n_outputs;
10242
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10243
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10244
+ }
10245
+
10246
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10247
+ cb(ffn_inp, "ffn_inp", il);
10248
+
10249
+ // feed-forward network
10250
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10251
+ NULL, NULL,
10252
+ LLM_NORM, cb, il);
10253
+ cb(cur, "ffn_norm", il);
10254
+
10255
+ cur = llm_build_ffn(ctx0, cur,
10256
+ model.layers[il].ffn_up, NULL,
10257
+ model.layers[il].ffn_gate, NULL,
10258
+ model.layers[il].ffn_down, NULL,
10259
+ NULL,
10260
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10261
+ cb(cur, "ffn_out", il);
10262
+
10263
+ cur = ggml_add(ctx0, cur, ffn_inp);
10264
+ cb(cur, "ffn_out", il);
10265
+
10266
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10267
+ if (layer_dir != nullptr) {
10268
+ cur = ggml_add(ctx0, cur, layer_dir);
10269
+ }
10270
+ cb(cur, "l_out", il);
10271
+
10272
+ // input for next layer
10273
+ inpL = cur;
10274
+ }
10275
+
10276
+ cur = inpL;
10277
+
10278
+ cur = llm_build_norm(ctx0, cur, hparams,
10279
+ NULL, NULL,
10280
+ LLM_NORM, cb, -1);
10281
+ cb(cur, "result_norm", -1);
10282
+
10283
+ // lm_head
10284
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10285
+ cb(cur, "result_output", -1);
10286
+
10287
+ ggml_build_forward_expand(gf, cur);
10288
+
10289
+ return gf;
10290
+ }
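As noted in its header comment, build_olmo() above differs from build_llama() mainly by using non-parametric layer norm (NULL weight/bias) and by clamping the Q/K/V projections to ±hparams.f_clamp_kqv when that value is positive, before RoPE and attention. A tiny standalone sketch of the clamping step; the 8.0f limit is a made-up stand-in for f_clamp_kqv:

// Sketch of the QKV clamping applied in build_olmo() via ggml_clamp().
// Plain C++ (C++17 for std::clamp); values and limit are illustrative only.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const float clamp_kqv = 8.0f; // hypothetical; a value <= 0.0f would disable clamping
    std::vector<float> qcur = { -12.5f, -3.0f, 0.7f, 9.9f };

    if (clamp_kqv > 0.0f) {
        for (float & v : qcur) {
            v = std::clamp(v, -clamp_kqv, clamp_kqv); // same effect as ggml_clamp(ctx0, Qcur, -c, c)
        }
    }
    for (float v : qcur) std::printf("%g ", v); // -8 -3 0.7 8
    std::printf("\n");
    return 0;
}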
9525
10291
  };
9526
10292
 
9527
10293
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9671,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
9671
10437
  {
9672
10438
  result = llm.build_qwen2();
9673
10439
  } break;
10440
+ case LLM_ARCH_QWEN2MOE:
10441
+ {
10442
+ result = llm.build_qwen2moe();
10443
+ } break;
9674
10444
  case LLM_ARCH_PHI2:
9675
10445
  {
9676
10446
  result = llm.build_phi2();
@@ -9715,9 +10485,17 @@ static struct ggml_cgraph * llama_build_graph(
9715
10485
  {
9716
10486
  result = llm.build_xverse();
9717
10487
  } break;
9718
- case LLM_ARCH_COMMAND_R:
10488
+ case LLM_ARCH_COMMAND_R:
10489
+ {
10490
+ result = llm.build_command_r();
10491
+ } break;
10492
+ case LLM_ARCH_DBRX:
10493
+ {
10494
+ result = llm.build_dbrx();
10495
+ } break;
10496
+ case LLM_ARCH_OLMO:
9719
10497
  {
9720
- result = llm.build_command_r();
10498
+ result = llm.build_olmo();
9721
10499
  } break;
9722
10500
  default:
9723
10501
  GGML_ASSERT(false);
@@ -10409,6 +11187,9 @@ static int llama_decode_internal(
10409
11187
  n_outputs_prev += lctx.n_outputs;
10410
11188
  }
10411
11189
 
11190
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
11191
+ lctx.n_outputs = n_outputs;
11192
+
10412
11193
  // wait for the computation to finish (automatically done when obtaining the model output)
10413
11194
  //llama_synchronize(&lctx);
10414
11195
 
@@ -11052,7 +11833,7 @@ struct llm_tokenizer_bpe {
11052
11833
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
11053
11834
  }
11054
11835
 
11055
- // add the fnished tokens to the final list keeping correct order for next and prev
11836
+ // add the finished tokens to the final list keeping correct order for next and prev
11056
11837
  for (auto & sym : symbols) {
11057
11838
  if (sym.n > 0) {
11058
11839
  sym.prev = final_prev_index;
@@ -11321,9 +12102,6 @@ struct llm_tokenizer_wpm {
11321
12102
  output.push_back(vocab.special_unk_id);
11322
12103
  }
11323
12104
  }
11324
-
11325
- // append eos token
11326
- output.push_back(vocab.special_eos_id);
11327
12105
  }
11328
12106
 
11329
12107
  std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +12306,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
11528
12306
  }
11529
12307
  }
11530
12308
 
11531
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
12309
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
11532
12310
  std::vector<llama_vocab::id> output;
11533
-
11534
- // OG tokenizer behavior:
11535
- //
11536
- // tokenizer.encode('', add_bos=True) returns [1]
11537
- // tokenizer.encode('', add_bos=False) returns []
11538
-
11539
- if (bos && vocab.special_bos_id != -1) {
11540
- output.push_back(vocab.special_bos_id);
11541
- }
11542
-
11543
- if (raw_text.empty()) {
11544
- return output;
11545
- }
11546
-
11547
12311
  std::forward_list<fragment_buffer_variant> fragment_buffer;
11548
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
11549
12312
 
11550
- if (special) tokenizer_st_partition(vocab, fragment_buffer);
12313
+ if (!raw_text.empty()) {
12314
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
12315
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
12316
+ }
11551
12317
 
11552
12318
  switch (vocab.type) {
11553
12319
  case LLAMA_VOCAB_TYPE_SPM:
11554
12320
  {
12321
+ // OG tokenizer behavior:
12322
+ //
12323
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
12324
+ // tokenizer.encode('', add_special_tokens=False) returns []
12325
+
12326
+ if (add_special && vocab.special_add_bos != 0) {
12327
+ GGML_ASSERT(vocab.special_bos_id != -1);
12328
+ output.push_back(vocab.special_bos_id);
12329
+ }
12330
+
11555
12331
  for (const auto & fragment : fragment_buffer) {
11556
12332
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11557
12333
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +12353,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11577
12353
  output.push_back(fragment.token);
11578
12354
  }
11579
12355
  }
12356
+
12357
+ if (add_special && vocab.special_add_eos == 1) {
12358
+ GGML_ASSERT(vocab.special_eos_id != -1);
12359
+ output.push_back(vocab.special_eos_id);
12360
+ }
11580
12361
  } break;
11581
12362
  case LLAMA_VOCAB_TYPE_BPE:
11582
12363
  {
12364
+ if (add_special && vocab.special_add_bos == 1) {
12365
+ GGML_ASSERT(vocab.special_bos_id != -1);
12366
+ output.push_back(vocab.special_bos_id);
12367
+ }
12368
+
11583
12369
  for (const auto & fragment : fragment_buffer) {
11584
12370
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11585
12371
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +12379,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11593
12379
  output.push_back(fragment.token);
11594
12380
  }
11595
12381
  }
12382
+
12383
+ GGML_ASSERT(vocab.special_add_eos != 1);
11596
12384
  } break;
11597
12385
  case LLAMA_VOCAB_TYPE_WPM:
11598
12386
  {
12387
+ if (add_special) {
12388
+ GGML_ASSERT(vocab.special_cls_id != -1);
12389
+ output.push_back(vocab.special_cls_id);
12390
+ }
12391
+
11599
12392
  for (const auto & fragment : fragment_buffer) {
11600
12393
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11601
12394
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +12402,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11609
12402
  output.push_back(fragment.token);
11610
12403
  }
11611
12404
  }
12405
+
12406
+ if (add_special) {
12407
+ GGML_ASSERT(vocab.special_sep_id != -1);
12408
+ output.push_back(vocab.special_sep_id);
12409
+ }
11612
12410
  } break;
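The tokenizer entry point now takes add_special (insert BOS/EOS for SPM, BOS for BPE, or CLS/SEP for WPM) and parse_special (allow special-token text in the input to be parsed) in place of the old bos/special pair, and the same rename is applied to the public llama_tokenize() later in this diff. A hedged usage sketch, assuming llama.h from this version and an already-loaded model; the retry relies on the upstream convention that llama_tokenize() returns the negated required token count when the buffer is too small:

// Sketch: tokenizing with the renamed flags (add_special / parse_special).
// Not a full program; `model` must come from llama_load_model_from_file().
#include "llama.h"
#include <string>
#include <vector>

std::vector<llama_token> tokenize(const llama_model * model, const std::string & text,
                                  bool add_special, bool parse_special) {
    // first attempt with a rough upper bound on the token count
    std::vector<llama_token> tokens(text.size() + 2);
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               add_special, parse_special);
    if (n < 0) {
        // buffer too small: -n is the required size, retry once
        tokens.resize((size_t) -n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           add_special, parse_special);
    }
    tokens.resize(n > 0 ? (size_t) n : 0);
    return tokens;
}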
11613
12411
  case LLAMA_VOCAB_TYPE_NONE:
11614
12412
  GGML_ASSERT(false);
@@ -11775,7 +12573,9 @@ static void llama_grammar_advance_stack(
11775
12573
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11776
12574
 
11777
12575
  if (stack.empty()) {
11778
- new_stacks.emplace_back(stack);
12576
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
12577
+ new_stacks.emplace_back(stack);
12578
+ }
11779
12579
  return;
11780
12580
  }
11781
12581
 
@@ -11812,7 +12612,10 @@ static void llama_grammar_advance_stack(
11812
12612
  }
11813
12613
  case LLAMA_GRETYPE_CHAR:
11814
12614
  case LLAMA_GRETYPE_CHAR_NOT:
11815
- new_stacks.emplace_back(stack);
12615
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
12616
+ // only add the stack if it's not a duplicate of one we already have
12617
+ new_stacks.emplace_back(stack);
12618
+ }
11816
12619
  break;
11817
12620
  default:
11818
12621
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +12629,13 @@ static void llama_grammar_advance_stack(
11826
12629
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
11827
12630
  // produces the N possible stacks if the given char is accepted at those
11828
12631
  // positions
11829
- std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
12632
+ void llama_grammar_accept(
11830
12633
  const std::vector<std::vector<llama_grammar_element>> & rules,
11831
12634
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
11832
- const uint32_t chr) {
12635
+ const uint32_t chr,
12636
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11833
12637
 
11834
- std::vector<std::vector<const llama_grammar_element *>> new_stacks;
12638
+ new_stacks.clear();
11835
12639
 
11836
12640
  for (const auto & stack : stacks) {
11837
12641
  if (stack.empty()) {
@@ -11850,8 +12654,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11850
12654
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
11851
12655
  }
11852
12656
  }
11853
-
11854
- return new_stacks;
11855
12657
  }
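Two changes are visible in the grammar code above: candidate stacks are only pushed after a std::find check so duplicates are dropped, and llama_grammar_accept now fills a caller-provided new_stacks vector (cleared on entry) instead of returning a fresh one, so the same buffer can be reused across code points. A small standalone sketch of that pattern, with std::vector<int> standing in for a stack of grammar-element pointers:

// Sketch of the "clear-and-reuse output vector, skip duplicates" pattern
// used by llama_grammar_accept / llama_grammar_advance_stack above.
#include <algorithm>
#include <cstdio>
#include <vector>

using stack_t = std::vector<int>;

static void push_unique(std::vector<stack_t> & out, const stack_t & s) {
    if (std::find(out.begin(), out.end(), s) == out.end()) {
        out.push_back(s); // only add the stack if it is not a duplicate
    }
}

static void accept(const std::vector<stack_t> & stacks, int chr, std::vector<stack_t> & new_stacks) {
    new_stacks.clear(); // reuse the caller's buffer instead of allocating a new vector
    for (const auto & s : stacks) {
        stack_t next = s;
        next.push_back(chr);
        push_unique(new_stacks, next);
    }
}

int main() {
    std::vector<stack_t> stacks = { {1}, {1}, {2} }; // contains a duplicate on purpose
    std::vector<stack_t> tmp;                        // reused across iterations by the caller
    accept(stacks, 7, tmp);
    std::printf("unique stacks after accept: %zu\n", tmp.size()); // prints 2
    return 0;
}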
11856
12658
 
11857
12659
  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +12667,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11865
12667
  const std::vector<llama_grammar_candidate> & candidates) {
11866
12668
 
11867
12669
  std::vector<llama_grammar_candidate> rejects;
12670
+ rejects.reserve(candidates.size());
11868
12671
 
11869
12672
  if (stack.empty()) {
11870
12673
  for (const auto & tok : candidates) {
@@ -11878,6 +12681,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11878
12681
  const llama_grammar_element * stack_pos = stack.back();
11879
12682
 
11880
12683
  std::vector<llama_grammar_candidate> next_candidates;
12684
+ next_candidates.reserve(candidates.size());
12685
+
11881
12686
  for (const auto & tok : candidates) {
11882
12687
  if (*tok.code_points == 0) {
11883
12688
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +13490,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
12685
13490
  // Note terminating 0 in decoded string
12686
13491
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
12687
13492
  const auto & code_points = decoded.first;
13493
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
12688
13494
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
12689
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
13495
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
13496
+ grammar->stacks = tmp_new_stacks;
12690
13497
  }
12691
13498
  grammar->partial_utf8 = decoded.second;
12692
13499
  GGML_ASSERT(!grammar->stacks.empty());
@@ -12820,6 +13627,11 @@ struct llama_beam_search_data {
12820
13627
  }
12821
13628
  llama_logit_info logit_info(ctx);
12822
13629
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
13630
+
13631
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
13632
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
13633
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
13634
+
12823
13635
  size_t i=0;
12824
13636
  if (next_beams.size() < n_beams) {
12825
13637
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13318,9 +14130,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
13318
14130
  return new_type;
13319
14131
  }
13320
14132
 
13321
- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14133
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
13322
14134
  std::mutex mutex;
13323
- int counter = 0;
14135
+ int64_t counter = 0;
13324
14136
  size_t new_size = 0;
13325
14137
  if (nthread < 2) {
13326
14138
  // single-thread
@@ -13328,11 +14140,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13328
14140
  }
13329
14141
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
13330
14142
  nrows, n_per_row, imatrix]() {
13331
- const int nrows_per_chunk = chunk_size / n_per_row;
14143
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
13332
14144
  size_t local_size = 0;
13333
14145
  while (true) {
13334
14146
  std::unique_lock<std::mutex> lock(mutex);
13335
- int first_row = counter; counter += nrows_per_chunk;
14147
+ int64_t first_row = counter; counter += nrows_per_chunk;
13336
14148
  if (first_row >= nrows) {
13337
14149
  if (local_size > 0) {
13338
14150
  new_size += local_size;
@@ -13340,7 +14152,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13340
14152
  break;
13341
14153
  }
13342
14154
  lock.unlock();
13343
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14155
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
13344
14156
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
13345
14157
  }
13346
14158
  };
@@ -13440,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13440
14252
  gguf_set_kv (ctx_out, ml.meta);
13441
14253
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
13442
14254
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
14255
+ // Remove split metadata
14256
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
14257
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
14258
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
13443
14259
 
13444
14260
  if (params->kv_overrides) {
13445
14261
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -13463,7 +14279,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13463
14279
  const std::string name = ggml_get_name(meta);
13464
14280
 
13465
14281
  // TODO: avoid hardcoded tensor names - use the TN_* constants
13466
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
14282
+ if (name.find("attn_v.weight") != std::string::npos ||
14283
+ name.find("attn_qkv.weight") != std::string::npos) {
13467
14284
  ++qs.n_attention_wv;
13468
14285
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13469
14286
  qs.has_output = true;
@@ -13473,7 +14290,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13473
14290
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
14291
 
13475
14292
  // sanity checks
13476
- GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
14293
+ //
14294
+ // - qs.n_attention_wv == 0 for Mamba models
14295
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
14296
+ //
14297
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
13477
14298
 
13478
14299
  size_t total_size_org = 0;
13479
14300
  size_t total_size_new = 0;
@@ -13529,6 +14350,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13529
14350
 
13530
14351
  // quantize only 2D and 3D tensors (experts)
13531
14352
  quantize &= (ggml_n_dims(tensor) >= 2);
14353
+
14354
+ // do not quantize norm tensors
14355
+ quantize &= name.find("_norm.weight") == std::string::npos;
14356
+
13532
14357
  quantize &= params->quantize_output_tensor || name != "output.weight";
13533
14358
  quantize &= !params->only_copy;
13534
14359
 
@@ -13557,10 +14382,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13557
14382
  if (!params->pure && ggml_is_quantized(default_type)) {
13558
14383
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
13559
14384
  }
13560
- else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
14385
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
14386
  new_type = params->token_embedding_type;
13562
14387
  }
13563
- else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
14388
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
14389
  new_type = params->output_tensor_type;
13565
14390
  }
13566
14391
 
@@ -13575,7 +14400,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13575
14400
  new_size = ggml_nbytes(tensor);
13576
14401
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
13577
14402
  } else {
13578
- const size_t nelements = ggml_nelements(tensor);
14403
+ const int64_t nelements = ggml_nelements(tensor);
13579
14404
 
13580
14405
  const float * imatrix = nullptr;
13581
14406
  if (imatrix_data) {
@@ -13627,20 +14452,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13627
14452
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
13628
14453
  fflush(stdout);
13629
14454
 
13630
- if (work.size() < nelements * 4) {
14455
+ if (work.size() < (size_t)nelements * 4) {
13631
14456
  work.resize(nelements * 4); // upper bound on size
13632
14457
  }
13633
14458
  new_data = work.data();
13634
14459
 
13635
- const int n_per_row = tensor->ne[0];
13636
- const int nrows = tensor->ne[1];
14460
+ const int64_t n_per_row = tensor->ne[0];
14461
+ const int64_t nrows = tensor->ne[1];
13637
14462
 
13638
- static const int min_chunk_size = 32 * 512;
13639
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
14463
+ static const int64_t min_chunk_size = 32 * 512;
14464
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13640
14465
 
13641
- const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
- const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13643
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
14466
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
14467
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
14468
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
13644
14469
 
13645
14470
  // quantize each expert separately since they have different importance matrices
13646
14471
  new_size = 0;
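The quantization path above widens its row and chunk bookkeeping from int to int64_t so that very large tensors (rows × row length beyond 2^31 elements, e.g. stacked expert tensors) no longer overflow when computing chunk counts and the number of worker threads. A standalone sketch of that arithmetic with hypothetical tensor dimensions:

// Sketch of the int64_t chunking arithmetic used when quantizing a tensor.
// Standalone C++; the dimensions and thread count are made up for illustration.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_per_row = 14336;     // hypothetical tensor->ne[0]
    const int64_t nrows     = 8192 * 60; // hypothetical tensor->ne[1]
    const int     nthread   = 16;

    static const int64_t min_chunk_size = 32 * 512;
    const int64_t chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

    const int64_t nelements   = n_per_row * nrows;                  // ~7.0e9, would overflow a 32-bit int
    const int64_t nchunk      = (nelements + chunk_size - 1) / chunk_size;
    const int64_t nthread_use = nthread > 1
        ? std::max<int64_t>(1, std::min<int64_t>(nthread, nchunk))
        : 1;

    std::printf("elements=%lld chunks=%lld threads=%lld\n",
                (long long) nelements, (long long) nchunk, (long long) nthread_use);
    return 0;
}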
@@ -14525,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14525
15350
  case LLM_ARCH_MINICPM:
14526
15351
  case LLM_ARCH_XVERSE:
14527
15352
  case LLM_ARCH_COMMAND_R:
15353
+ case LLM_ARCH_OLMO:
14528
15354
  return LLAMA_ROPE_TYPE_NORM;
14529
15355
 
14530
15356
  // the pairs of head values are offset by n_rot/2
14531
15357
  case LLM_ARCH_FALCON:
14532
15358
  case LLM_ARCH_GROK:
15359
+ case LLM_ARCH_DBRX:
14533
15360
  case LLM_ARCH_PERSIMMON:
14534
15361
  case LLM_ARCH_BERT:
14535
15362
  case LLM_ARCH_NOMIC_BERT:
14536
15363
  case LLM_ARCH_STABLELM:
14537
15364
  case LLM_ARCH_QWEN:
14538
15365
  case LLM_ARCH_QWEN2:
15366
+ case LLM_ARCH_QWEN2MOE:
14539
15367
  case LLM_ARCH_PHI2:
14540
15368
  case LLM_ARCH_GEMMA:
14541
15369
  case LLM_ARCH_STARCODER2:
@@ -14905,9 +15733,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
14905
15733
  llama_kv_cache_update_internal(*ctx);
14906
15734
  }
14907
15735
 
15736
+ // deprecated
15737
+ size_t llama_get_state_size(const struct llama_context * ctx) {
15738
+ return llama_state_get_size(ctx);
15739
+ }
15740
+
15741
+ // deprecated
15742
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15743
+ return llama_state_get_data(ctx, dst);
15744
+ }
15745
+
15746
+ // deprecated
15747
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15748
+ return llama_state_set_data(ctx, src);
15749
+ }
15750
+
15751
+ // deprecated
15752
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15753
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15754
+ }
15755
+
15756
+ // deprecated
15757
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15758
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
15759
+ }
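The old whole-context state functions are kept above as thin deprecated wrappers; new code should call the llama_state_* names directly. A minimal sketch of a round trip through a memory buffer with the renamed API (declarations as shown in this diff; error handling omitted, and a valid ctx is assumed):

// Sketch: whole-context state round trip with the renamed llama_state_* API.
#include "llama.h"
#include <cstdint>
#include <vector>

void state_round_trip(llama_context * ctx) {
    // llama_state_get_size() returns the *maximum* size, so the buffer may not be filled completely
    std::vector<uint8_t> buf(llama_state_get_size(ctx));

    const size_t written = llama_state_get_data(ctx, buf.data()); // was llama_copy_state_data()
    const size_t read    = llama_state_set_data(ctx, buf.data()); // was llama_set_state_data()

    (void) written;
    (void) read; // both are <= buf.size()
}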
14908
15760
 
14909
15761
  // Returns the *maximum* size of the state
14910
- size_t llama_get_state_size(const struct llama_context * ctx) {
15762
+ size_t llama_state_get_size(const struct llama_context * ctx) {
14911
15763
  const auto & cparams = ctx->cparams;
14912
15764
  const auto & hparams = ctx->model.hparams;
14913
15765
 
@@ -14995,15 +15847,15 @@ struct llama_data_file_context : llama_data_context {
14995
15847
  * file context:
14996
15848
  * llama_file file("/path", "wb");
14997
15849
  * llama_data_file_context data_ctx(&file);
14998
- * llama_copy_state_data(ctx, &data_ctx);
15850
+ * llama_state_get_data(ctx, &data_ctx);
14999
15851
  *
15000
15852
  * buffer context:
15001
15853
  * std::vector<uint8_t> buf(max_size, 0);
15002
15854
  * llama_data_buffer_context data_ctx(&buf.data());
15003
- * llama_copy_state_data(ctx, &data_ctx);
15855
+ * llama_state_get_data(ctx, &data_ctx);
15004
15856
  *
15005
15857
  */
15006
- static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15858
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15007
15859
  // copy rng
15008
15860
  {
15009
15861
  std::ostringstream rng_ss;
@@ -15147,15 +15999,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
15147
15999
  }
15148
16000
  }
15149
16001
 
15150
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
16002
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
15151
16003
  llama_data_buffer_context data_ctx(dst);
15152
- llama_copy_state_data_internal(ctx, &data_ctx);
16004
+ llama_state_get_data_internal(ctx, &data_ctx);
15153
16005
 
15154
16006
  return data_ctx.get_size_written();
15155
16007
  }
15156
16008
 
15157
16009
  // Sets the state reading from the specified source address
15158
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
16010
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15159
16011
  const uint8_t * inp = src;
15160
16012
 
15161
16013
  // set rng
@@ -15192,6 +16044,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15192
16044
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15193
16045
  ctx->output_ids[id] = i;
15194
16046
  }
16047
+
16048
+ ctx->n_outputs = n_outputs;
15195
16049
  }
15196
16050
  }
15197
16051
 
@@ -15307,14 +16161,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15307
16161
  }
15308
16162
 
15309
16163
  const size_t nread = inp - src;
15310
- const size_t max_size = llama_get_state_size(ctx);
16164
+ const size_t max_size = llama_state_get_size(ctx);
15311
16165
 
15312
16166
  GGML_ASSERT(nread <= max_size);
15313
16167
 
15314
16168
  return nread;
15315
16169
  }
15316
16170
 
15317
- static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16171
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15318
16172
  llama_file file(path_session, "rb");
15319
16173
 
15320
16174
  // sanity checks
@@ -15352,7 +16206,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15352
16206
  // restore the context state
15353
16207
  {
15354
16208
  const size_t n_state_size_cur = file.size - file.tell();
15355
- const size_t n_state_size_max = llama_get_state_size(ctx);
16209
+ const size_t n_state_size_max = llama_state_get_size(ctx);
15356
16210
 
15357
16211
  if (n_state_size_cur > n_state_size_max) {
15358
16212
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +16216,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15362
16216
  std::vector<uint8_t> state_data(n_state_size_max);
15363
16217
  file.read_raw(state_data.data(), n_state_size_cur);
15364
16218
 
15365
- llama_set_state_data(ctx, state_data.data());
16219
+ llama_state_set_data(ctx, state_data.data());
15366
16220
  }
15367
16221
 
15368
16222
  return true;
15369
16223
  }
15370
16224
 
15371
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16225
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15372
16226
  try {
15373
- return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
16227
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15374
16228
  } catch (const std::exception & err) {
15375
16229
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
15376
16230
  return false;
15377
16231
  }
15378
16232
  }
15379
16233
 
15380
- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
16234
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15381
16235
  llama_file file(path_session, "wb");
15382
16236
 
15383
16237
  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +16245,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
15391
16245
 
15392
16246
  // save the context state using stream saving
15393
16247
  llama_data_file_context data_ctx(&file);
15394
- llama_copy_state_data_internal(ctx, &data_ctx);
16248
+ llama_state_get_data_internal(ctx, &data_ctx);
15395
16249
 
15396
16250
  return true;
15397
16251
  }
15398
16252
 
16253
+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
16254
+ try {
16255
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
16256
+ } catch (const std::exception & err) {
16257
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
16258
+ return false;
16259
+ }
16260
+ }
16261
+
16262
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
16263
+ // save the size of size_t as a uint32_t for safety check
16264
+ const size_t size_t_size_size = sizeof(uint32_t);
16265
+
16266
+ // other values
16267
+ const size_t s_cell_count_size = sizeof(uint32_t);
16268
+ const size_t s_layer_count_size = sizeof(uint32_t);
16269
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
16270
+
16271
+ size_t s_cell_count = 0;
16272
+ size_t s_cell_data_size = 0;
16273
+ const auto & kv_self = ctx->kv_self;
16274
+ const auto & hparams = ctx->model.hparams;
16275
+
16276
+ const uint32_t n_layer = hparams.n_layer;
16277
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16278
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16279
+
16280
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
16281
+ const auto & cell = kv_self.cells[i];
16282
+ if (cell.seq_id.count(seq_id) > 0) {
16283
+ ++s_cell_count;
16284
+ s_cell_data_size += sizeof(llama_pos);
16285
+ }
16286
+ }
16287
+
16288
+ for (int il = 0; il < (int)n_layer; ++il) {
16289
+ // types of keys and values
16290
+ s_cell_data_size += sizeof(int32_t) * 2;
16291
+ // k_size_row and v_size_el values of layer
16292
+ s_cell_data_size += sizeof(size_t) * 2;
16293
+
16294
+ // keys
16295
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16296
+ s_cell_data_size += k_size_row * s_cell_count;
16297
+
16298
+ // values (transposed)
16299
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16300
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
16301
+ }
16302
+
16303
+ const size_t s_total = (
16304
+ size_t_size_size +
16305
+ s_cell_count_size +
16306
+ s_layer_count_size +
16307
+ n_embd_v_gqa_size +
16308
+ s_cell_data_size
16309
+ );
16310
+
16311
+ return s_total;
16312
+ }
16313
+
16314
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16315
+ const auto & kv_self = ctx->kv_self;
16316
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
16317
+
16318
+ // Save the size of size_t as a uint32_t for safety check
16319
+ const uint32_t size_t_size = sizeof(size_t);
16320
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
16321
+
16322
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
16323
+ uint32_t cell_count = 0;
16324
+
16325
+ // Count the number of cells with the specified seq_id
16326
+ // Find all the ranges of cells with this seq id
16327
+ {
16328
+ uint32_t cell_range_begin = kv_self.size;
16329
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
16330
+ const auto & cell = kv_self.cells[i];
16331
+ if (cell.has_seq_id(seq_id)) {
16332
+ ++cell_count;
16333
+ if (cell_range_begin == kv_self.size) {
16334
+ cell_range_begin = i;
16335
+ }
16336
+ }
16337
+ else {
16338
+ if (cell_range_begin != kv_self.size) {
16339
+ cell_ranges.push_back({ cell_range_begin, i });
16340
+ cell_range_begin = kv_self.size;
16341
+ }
16342
+ }
16343
+ }
16344
+ if (cell_range_begin != kv_self.size) {
16345
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
16346
+ }
16347
+
16348
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
16349
+ uint32_t cell_count_check = 0;
16350
+ for (const auto & range : cell_ranges) {
16351
+ cell_count_check += range.second - range.first;
16352
+ }
16353
+ GGML_ASSERT(cell_count == cell_count_check);
16354
+ }
16355
+
16356
+ // Write the cell count
16357
+ data_ctx.write(&cell_count, sizeof(cell_count));
16358
+
16359
+ const auto & hparams = ctx->model.hparams;
16360
+ const uint32_t n_layer = hparams.n_layer;
16361
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16362
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16363
+
16364
+ // Write the layer count
16365
+ data_ctx.write(&n_layer, sizeof(n_layer));
16366
+
16367
+ // Write n_embd_v_gqa
16368
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
16369
+
16370
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
16371
+ for (const auto & range : cell_ranges) {
16372
+ for (uint32_t i = range.first; i < range.second; ++i) {
16373
+ const auto & cell = kv_self.cells[i];
16374
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
16375
+ }
16376
+ }
16377
+
16378
+ // Iterate and write all the keys first, each row is a cell
16379
+ // Get whole range at a time
16380
+ std::vector<uint8_t> tmp_buf;
16381
+ for (int il = 0; il < (int)n_layer; ++il) {
16382
+ // Write key type
16383
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
16384
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
16385
+
16386
+ // Write row size of key
16387
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16388
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
16389
+
16390
+ // Read each range of cells of k_size length each into tmp_buf and write out
16391
+ for (const auto & range : cell_ranges) {
16392
+ const size_t range_size = range.second - range.first;
16393
+ tmp_buf.resize(range_size * k_size_row);
16394
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
16395
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16396
+ }
16397
+ }
16398
+
16399
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16400
+ const uint32_t kv_size = kv_self.size;
16401
+ for (int il = 0; il < (int)n_layer; ++il) {
16402
+ // Write value type
16403
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16404
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16405
+
16406
+ // Write element size
16407
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16408
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16409
+
16410
+ // For each row, we get the element values of each cell
16411
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16412
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16413
+ for (const auto & range : cell_ranges) {
16414
+ const size_t range_size = range.second - range.first;
16415
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16416
+ tmp_buf.resize(range_size * v_size_el);
16417
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16418
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16419
+ }
16420
+ }
16421
+ }
16422
+
16423
+ return data_ctx.get_size_written();
16424
+ }
16425
+
16426
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
16427
+ llama_data_buffer_context data_ctx(dst);
16428
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
16429
+ }
16430
+
16431
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16432
+ auto & kv_self = ctx->kv_self;
16433
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
16434
+
16435
+ // Wipe the slot
16436
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16437
+
16438
+ const uint8_t * inp = src;
16439
+
16440
+ // Read size of size_t
16441
+ uint32_t size_t_size;
16442
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
16443
+ inp += sizeof(size_t_size);
16444
+ if (size_t_size != sizeof(size_t)) {
16445
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
16446
+ return 0;
16447
+ }
16448
+
16449
+ // Read the cell count
16450
+ uint32_t cell_count;
16451
+ memcpy(&cell_count, inp, sizeof(cell_count));
16452
+ inp += sizeof(cell_count);
16453
+
16454
+ // Read the layer count
16455
+ uint32_t n_layer_ref;
16456
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
16457
+ inp += sizeof(n_layer_ref);
16458
+
16459
+ // Read n_embd_v_gqa
16460
+ uint32_t n_embd_v_gqa_ref;
16461
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
16462
+ inp += sizeof(n_embd_v_gqa_ref);
16463
+
16464
+ // Sanity check model compatibility
16465
+ const auto & hparams = ctx->model.hparams;
16466
+ const uint32_t n_layer = hparams.n_layer;
16467
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16468
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16469
+ if (n_layer != n_layer_ref) {
16470
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
16471
+ return 0;
16472
+ }
16473
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
16474
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
16475
+ return 0;
16476
+ }
16477
+
16478
+ // Allocate the new cells for the slot
16479
+ if (cell_count) {
16480
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
16481
+ batch.n_tokens = cell_count;
16482
+ for (uint32_t i = 0; i < cell_count; ++i) {
16483
+ llama_pos pos;
16484
+ memcpy(&pos, inp, sizeof(pos));
16485
+ inp += sizeof(pos);
16486
+
16487
+ batch.pos[i] = pos;
16488
+ batch.n_seq_id[i] = 1;
16489
+ batch.seq_id[i][0] = dest_seq_id;
16490
+ }
16491
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
16492
+ llama_batch_free(batch);
16493
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
16494
+ return 0;
16495
+ }
16496
+
16497
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
16498
+ // Assume that this is one contiguous block of cells
16499
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
16500
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
16501
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
16502
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
16503
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
16504
+
16505
+ // Cleanup
16506
+ llama_batch_free(batch);
16507
+ }
16508
+
16509
+ const uint32_t kv_size = kv_self.size;
16510
+ const uint32_t kv_head = kv_self.head;
16511
+
16512
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
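+ // (keys are stored row-major per cell, so a range of cells maps to a contiguous byte range)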
16513
+ for (int il = 0; il < (int)n_layer; ++il) {
16514
+ // Read type of key
16515
+ int32_t k_type_i_ref;
16516
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
16517
+ inp += sizeof(k_type_i_ref);
16518
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
16519
+ if (k_type_i != k_type_i_ref) {
16520
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16521
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
16522
+ return 0;
16523
+ }
16524
+
16525
+ // Read row size of key
16526
+ size_t k_size_row_ref;
16527
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
16528
+ inp += sizeof(k_size_row_ref);
16529
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16530
+ if (k_size_row != k_size_row_ref) {
16531
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16532
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
16533
+ return 0;
16534
+ }
16535
+
16536
+ if (cell_count) {
16537
+ // Read and set the keys for the whole cell range
16538
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
16539
+ inp += cell_count * k_size_row;
16540
+ }
16541
+ }
16542
+
16543
+ // For each layer, read the values for each cell (transposed)
16544
+ for (int il = 0; il < (int)n_layer; ++il) {
16545
+ // Read type of value
16546
+ int32_t v_type_i_ref;
16547
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16548
+ inp += sizeof(v_type_i_ref);
16549
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16550
+ if (v_type_i != v_type_i_ref) {
16551
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16552
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16553
+ return 0;
16554
+ }
16555
+
16556
+ // Read element size of value
16557
+ size_t v_size_el_ref;
16558
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16559
+ inp += sizeof(v_size_el_ref);
16560
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16561
+ if (v_size_el != v_size_el_ref) {
16562
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16563
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16564
+ return 0;
16565
+ }
16566
+
16567
+ if (cell_count) {
16568
+ // For each row in the transposed matrix, read the values for the whole cell range
16569
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16570
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16571
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16572
+ inp += cell_count * v_size_el;
16573
+ }
16574
+ }
16575
+ }
16576
+
16577
+ const size_t nread = inp - src;
16578
+ return nread;
16579
+ }
16580
+
16581
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
16582
+ llama_file file(filepath, "wb");
16583
+
16584
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
16585
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
16586
+
16587
+ // save the prompt
16588
+ file.write_u32((uint32_t)n_token_count);
16589
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
16590
+
16591
+ // save the context state using stream saving
16592
+ llama_data_file_context data_ctx(&file);
16593
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
16594
+
16595
+ const size_t res = file.tell();
16596
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
16597
+ return res;
16598
+ }
16599
+
16600
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16601
+ llama_file file(filepath, "rb");
16602
+
16603
+ // version checks
16604
+ {
16605
+ const uint32_t magic = file.read_u32();
16606
+ const uint32_t version = file.read_u32();
16607
+
16608
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
16609
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
16610
+ return 0;
16611
+ }
16612
+ }
16613
+
16614
+ // load the prompt
16615
+ {
16616
+ const uint32_t n_token_count = file.read_u32();
16617
+
16618
+ if (n_token_count > n_token_capacity) {
16619
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
16620
+ return 0;
16621
+ }
16622
+
16623
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
16624
+ *n_token_count_out = n_token_count;
16625
+ }
16626
+
16627
+ // restore the context state
16628
+ {
16629
+ const size_t state_size = file.size - file.tell();
16630
+ std::vector<uint8_t> state_data(state_size);
16631
+ file.read_raw(state_data.data(), state_size);
16632
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
16633
+ if (!nread) {
16634
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
16635
+ return 0;
16636
+ }
16637
+ GGML_ASSERT(nread <= state_size);
16638
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
16639
+ }
16640
+
16641
+ return file.tell();
16642
+ }
16643
+
16644
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
16645
+ try {
16646
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
16647
+ } catch (const std::exception & err) {
16648
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
16649
+ return 0;
16650
+ }
16651
+ }
16652
+
16653
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16654
+ try {
16655
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
16656
+ } catch (const std::exception & err) {
16657
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
16658
+ return 0;
16659
+ }
16660
+ }
16661
+
15399
16662
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
15400
16663
  ctx->cparams.n_threads = n_threads;
15401
16664
  ctx->cparams.n_threads_batch = n_threads_batch;
@@ -15509,23 +16772,31 @@ float * llama_get_logits(struct llama_context * ctx) {
15509
16772
  }
15510
16773
 
15511
16774
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
16775
+ int32_t j = -1;
15512
16776
  llama_synchronize(ctx);
15513
16777
 
15514
16778
  try {
15515
16779
  if (ctx->logits == nullptr) {
15516
16780
  throw std::runtime_error("no logits");
15517
16781
  }
15518
- if ((size_t) i >= ctx->output_ids.size()) {
16782
+
16783
+ if (i < 0) {
16784
+ j = ctx->n_outputs + i;
16785
+ if (j < 0) {
16786
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16787
+ }
16788
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15519
16789
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16790
+ } else {
16791
+ j = ctx->output_ids[i];
15520
16792
  }
15521
- const int32_t j = ctx->output_ids[i];
15522
16793
 
15523
16794
  if (j < 0) {
15524
16795
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
16796
  }
15526
- if ((size_t) j >= ctx->output_size) {
16797
+ if (j >= ctx->n_outputs) {
15527
16798
  // This should not happen
15528
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16799
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15529
16800
  }
15530
16801
 
15531
16802
  return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15545,23 +16816,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
15545
16816
  }
15546
16817
 
15547
16818
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
16819
+ int32_t j = -1;
16820
+
15548
16821
  llama_synchronize(ctx);
15549
16822
 
15550
16823
  try {
15551
16824
  if (ctx->embd == nullptr) {
15552
16825
  throw std::runtime_error("no embeddings");
15553
16826
  }
15554
- if ((size_t) i >= ctx->output_ids.size()) {
16827
+
16828
+ if (i < 0) {
16829
+ j = ctx->n_outputs + i;
16830
+ if (j < 0) {
16831
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16832
+ }
16833
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15555
16834
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16835
+ } else {
16836
+ j = ctx->output_ids[i];
15556
16837
  }
15557
- const int32_t j = ctx->output_ids[i];
15558
16838
 
15559
16839
  if (j < 0) {
15560
16840
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
16841
  }
15562
- if ((size_t) j >= ctx->output_size) {
16842
+ if (j >= ctx->n_outputs) {
15563
16843
  // This should not happen
15564
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16844
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15565
16845
  }
15566
16846
 
15567
16847
  return ctx->embd + j*ctx->model.hparams.n_embd;
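Both accessors above (llama_get_logits_ith and llama_get_embeddings_ith) now accept Python-style negative indices: i = -1 maps to the last computed output via j = n_outputs + i, and bounds are checked against n_outputs instead of the old output_size. A small standalone sketch of just the index mapping; the n_outputs value is made up:

// Sketch of the negative-index mapping used by llama_get_logits_ith /
// llama_get_embeddings_ith above (non-negative indices additionally go
// through output_ids[i] upstream, which is omitted here).
#include <cstdio>
#include <stdexcept>

static int32_t resolve_output_index(int32_t i, int32_t n_outputs) {
    int32_t j = -1;
    if (i < 0) {
        j = n_outputs + i; // -1 -> last output, -2 -> second to last, ...
        if (j < 0) {
            throw std::out_of_range("negative index out of range");
        }
    } else {
        j = i;
    }
    if (j >= n_outputs) {
        throw std::out_of_range("index out of range");
    }
    return j;
}

int main() {
    const int32_t n_outputs = 4;
    std::printf("i=-1 -> j=%d\n", resolve_output_index(-1, n_outputs)); // 3
    std::printf("i= 2 -> j=%d\n", resolve_output_index( 2, n_outputs)); // 2
    return 0;
}

In practice this means llama_get_logits_ith(ctx, -1) after llama_decode() returns the logits of the last output token, which is the common sampling case.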
@@ -15608,6 +16888,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
15608
16888
  return model->vocab.special_eos_id;
15609
16889
  }
15610
16890
 
16891
+ llama_token llama_token_cls(const struct llama_model * model) {
16892
+ return model->vocab.special_cls_id;
16893
+ }
16894
+
16895
+ llama_token llama_token_sep(const struct llama_model * model) {
16896
+ return model->vocab.special_sep_id;
16897
+ }
16898
+
15611
16899
  llama_token llama_token_nl(const struct llama_model * model) {
15612
16900
  return model->vocab.linefeed_id;
15613
16901
  }
@@ -15642,9 +16930,9 @@ int32_t llama_tokenize(
15642
16930
  int32_t text_len,
15643
16931
  llama_token * tokens,
15644
16932
  int32_t n_tokens_max,
15645
- bool add_bos,
15646
- bool special) {
15647
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
16933
+ bool add_special,
16934
+ bool parse_special) {
16935
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
15648
16936
 
15649
16937
  if (n_tokens_max < (int) res.size()) {
15650
16938
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -15910,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
15910
17198
  if (add_ass) {
15911
17199
  ss << "### Response:\n";
15912
17200
  }
17201
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
17202
+ // CohereForAI/c4ai-command-r-plus
17203
+ for (auto message : chat) {
17204
+ std::string role(message->role);
17205
+ if (role == "system") {
17206
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17207
+ } else if (role == "user") {
17208
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17209
+ } else if (role == "assistant") {
17210
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17211
+ }
17212
+ }
17213
+ if (add_ass) {
17214
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17215
+ }
15913
17216
  } else {
15914
17217
  // template not supported
15915
17218
  return -1;
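The newly supported Command R template above wraps each message in <|START_OF_TURN_TOKEN|>...<|END_OF_TURN_TOKEN|> with a role token, trims each message's content, and appends an open chatbot turn when add_ass is set. A minimal standalone sketch that reproduces the same rendering for a short chat (plain C++ string building that mirrors the branch above rather than calling the library; trimming is omitted and the messages are illustrative):

// Sketch: the Command R rendering produced by the template branch above.
#include <cstdio>
#include <string>
#include <vector>

struct msg { std::string role, content; };

int main() {
    const std::vector<msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    const bool add_ass = true;

    std::string ss;
    for (const auto & m : chat) {
        if      (m.role == "system")    ss += "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>"  + m.content + "<|END_OF_TURN_TOKEN|>";
        else if (m.role == "user")      ss += "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>"    + m.content + "<|END_OF_TURN_TOKEN|>";
        else if (m.role == "assistant") ss += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + m.content + "<|END_OF_TURN_TOKEN|>";
    }
    if (add_ass) {
        ss += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; // leave the assistant turn open for generation
    }
    std::printf("%s\n", ss.c_str());
    return 0;
}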