llama_cpp 0.14.4 → 0.14.6

@@ -105,7 +105,7 @@
105
105
  #endif
106
106
 
107
107
  #define LLAMA_MAX_NODES 8192
108
- #define LLAMA_MAX_EXPERTS 8
108
+ #define LLAMA_MAX_EXPERTS 60
109
109
 
110
110
 
111
111
  //
@@ -209,6 +209,7 @@ enum llm_arch {
209
209
  LLM_ARCH_STABLELM,
210
210
  LLM_ARCH_QWEN,
211
211
  LLM_ARCH_QWEN2,
212
+ LLM_ARCH_QWEN2MOE,
212
213
  LLM_ARCH_PHI2,
213
214
  LLM_ARCH_PLAMO,
214
215
  LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
220
221
  LLM_ARCH_MAMBA,
221
222
  LLM_ARCH_XVERSE,
222
223
  LLM_ARCH_COMMAND_R,
224
+ LLM_ARCH_DBRX,
225
+ LLM_ARCH_OLMO,
223
226
  LLM_ARCH_UNKNOWN,
224
227
  };
225
228
 
@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
241
244
  { LLM_ARCH_STABLELM, "stablelm" },
242
245
  { LLM_ARCH_QWEN, "qwen" },
243
246
  { LLM_ARCH_QWEN2, "qwen2" },
247
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
244
248
  { LLM_ARCH_PHI2, "phi2" },
245
249
  { LLM_ARCH_PLAMO, "plamo" },
246
250
  { LLM_ARCH_CODESHELL, "codeshell" },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
252
256
  { LLM_ARCH_MAMBA, "mamba" },
253
257
  { LLM_ARCH_XVERSE, "xverse" },
254
258
  { LLM_ARCH_COMMAND_R, "command-r" },
259
+ { LLM_ARCH_DBRX, "dbrx" },
260
+ { LLM_ARCH_OLMO, "olmo" },
255
261
  { LLM_ARCH_UNKNOWN, "(unknown)" },
256
262
  };
257
263
 
@@ -261,6 +267,7 @@ enum llm_kv {
261
267
  LLM_KV_GENERAL_ALIGNMENT,
262
268
  LLM_KV_GENERAL_NAME,
263
269
  LLM_KV_GENERAL_AUTHOR,
270
+ LLM_KV_GENERAL_VERSION,
264
271
  LLM_KV_GENERAL_URL,
265
272
  LLM_KV_GENERAL_DESCRIPTION,
266
273
  LLM_KV_GENERAL_LICENSE,
@@ -317,11 +324,17 @@ enum llm_kv {
317
324
  LLM_KV_TOKENIZER_UNK_ID,
318
325
  LLM_KV_TOKENIZER_SEP_ID,
319
326
  LLM_KV_TOKENIZER_PAD_ID,
327
+ LLM_KV_TOKENIZER_CLS_ID,
328
+ LLM_KV_TOKENIZER_MASK_ID,
320
329
  LLM_KV_TOKENIZER_ADD_BOS,
321
330
  LLM_KV_TOKENIZER_ADD_EOS,
322
331
  LLM_KV_TOKENIZER_ADD_PREFIX,
323
332
  LLM_KV_TOKENIZER_HF_JSON,
324
333
  LLM_KV_TOKENIZER_RWKV,
334
+ LLM_KV_TOKENIZER_PREFIX_ID,
335
+ LLM_KV_TOKENIZER_SUFFIX_ID,
336
+ LLM_KV_TOKENIZER_MIDDLE_ID,
337
+ LLM_KV_TOKENIZER_EOT_ID,
325
338
  };
326
339
 
327
340
  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -330,6 +343,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
330
343
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
331
344
  { LLM_KV_GENERAL_NAME, "general.name" },
332
345
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
346
+ { LLM_KV_GENERAL_VERSION, "general.version" },
333
347
  { LLM_KV_GENERAL_URL, "general.url" },
334
348
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
335
349
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -386,11 +400,17 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
386
400
  { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
387
401
  { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
388
402
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
403
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
404
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
389
405
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
390
406
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
391
407
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
392
408
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
393
409
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
410
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
411
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
412
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
413
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
394
414
  };
395
415
 
396
416
  struct LLM_KV {
@@ -421,6 +441,7 @@ enum llm_tensor {
421
441
  LLM_TENSOR_ATTN_OUT_NORM,
422
442
  LLM_TENSOR_ATTN_ROT_EMBD,
423
443
  LLM_TENSOR_FFN_GATE_INP,
444
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
424
445
  LLM_TENSOR_FFN_NORM,
425
446
  LLM_TENSOR_FFN_GATE,
426
447
  LLM_TENSOR_FFN_DOWN,
@@ -432,6 +453,9 @@ enum llm_tensor {
432
453
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
433
454
  LLM_TENSOR_FFN_GATE_EXPS,
434
455
  LLM_TENSOR_FFN_UP_EXPS,
456
+ LLM_TENSOR_FFN_DOWN_SHEXP,
457
+ LLM_TENSOR_FFN_GATE_SHEXP,
458
+ LLM_TENSOR_FFN_UP_SHEXP,
435
459
  LLM_TENSOR_ATTN_Q_NORM,
436
460
  LLM_TENSOR_ATTN_K_NORM,
437
461
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -694,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
694
718
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
695
719
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
696
720
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
721
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
722
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
697
723
  },
698
724
  },
699
725
  {
@@ -729,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
729
755
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
730
756
  },
731
757
  },
758
+ {
759
+ LLM_ARCH_QWEN2MOE,
760
+ {
761
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
762
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
763
+ { LLM_TENSOR_OUTPUT, "output" },
764
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
765
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
766
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
767
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
768
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
769
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
770
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
771
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
772
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
773
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
774
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
775
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
776
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
777
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
778
+ },
779
+ },
732
780
  {
733
781
  LLM_ARCH_PHI2,
734
782
  {
@@ -924,6 +972,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
924
972
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
925
973
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
926
974
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
975
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
976
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
977
+ },
978
+ },
979
+ {
980
+ LLM_ARCH_DBRX,
981
+ {
982
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
983
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
984
+ { LLM_TENSOR_OUTPUT, "output" },
985
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
986
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
987
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
988
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
989
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
990
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
991
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
992
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
993
+ },
994
+ },
995
+ {
996
+ LLM_ARCH_OLMO,
997
+ {
998
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
999
+ { LLM_TENSOR_OUTPUT, "output" },
1000
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1001
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1002
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1003
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1004
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1005
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1006
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
927
1007
  },
928
1008
  },
929
1009
  {
@@ -1630,17 +1710,17 @@ static size_t llama_get_device_memory(int device) {
1630
1710
  #if defined(GGML_USE_CUDA)
1631
1711
  size_t total;
1632
1712
  size_t free;
1633
- ggml_backend_cuda_get_device_memory(device, &total, &free);
1713
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
1634
1714
  return free;
1635
1715
  #elif defined(GGML_USE_SYCL)
1636
1716
  size_t total;
1637
1717
  size_t free;
1638
- ggml_backend_sycl_get_device_memory(device, &total, &free);
1718
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
1639
1719
  return free;
1640
1720
  #elif defined(GGML_USE_VULKAN)
1641
1721
  size_t total;
1642
1722
  size_t free;
1643
- ggml_backend_vk_get_device_memory(device, &total, &free);
1723
+ ggml_backend_vk_get_device_memory(device, &free, &total);
1644
1724
  return free;
1645
1725
  #else
1646
1726
  return 1;
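
[Editor's note, illustrative sketch, not part of the diff] The hunk above fixes swapped out-parameters: the ggml backend helpers fill (free, total) in that order, so the old calls, which passed (&total, &free), made llama_get_device_memory() report total VRAM instead of free VRAM. A minimal stand-alone C++ sketch of the failure mode, using a hypothetical stand-in for the backend call:

#include <cstdio>
#include <cstddef>

// stand-in with the same parameter order as the ggml helpers: (device, free, total)
static void fake_get_device_memory(int /*device*/, size_t * free_mem, size_t * total_mem) {
    *total_mem = 24ull << 30; // pretend: a 24 GiB card
    *free_mem  =  8ull << 30; // of which 8 GiB is currently free
}

int main() {
    size_t total = 0, free_mem = 0;
    fake_get_device_memory(0, &total, &free_mem);   // old, swapped argument order
    std::printf("swapped call: 'free' = %zu GiB (really the total)\n", free_mem >> 30);
    fake_get_device_memory(0, &free_mem, &total);   // fixed argument order
    std::printf("fixed call:    free  = %zu GiB\n", free_mem >> 30);
    return 0;
}
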
@@ -1682,6 +1762,7 @@ enum e_model {
1682
1762
  MODEL_4B,
1683
1763
  MODEL_7B,
1684
1764
  MODEL_8B,
1765
+ MODEL_12B,
1685
1766
  MODEL_13B,
1686
1767
  MODEL_14B,
1687
1768
  MODEL_15B,
@@ -1697,6 +1778,10 @@ enum e_model {
1697
1778
  MODEL_MEDIUM,
1698
1779
  MODEL_LARGE,
1699
1780
  MODEL_XL,
1781
+ MODEL_A2_7B,
1782
+ MODEL_8x7B,
1783
+ MODEL_8x22B,
1784
+ MODEL_16x12B,
1700
1785
  };
1701
1786
 
1702
1787
  static const size_t kiB = 1024;
@@ -1880,6 +1965,12 @@ struct llama_layer {
1880
1965
  struct ggml_tensor * ffn_down_exps;
1881
1966
  struct ggml_tensor * ffn_up_exps ;
1882
1967
 
1968
+ // ff shared expert (shexp)
1969
+ struct ggml_tensor * ffn_gate_inp_shexp;
1970
+ struct ggml_tensor * ffn_gate_shexp;
1971
+ struct ggml_tensor * ffn_down_shexp;
1972
+ struct ggml_tensor * ffn_up_shexp;
1973
+
1883
1974
  // ff bias
1884
1975
  struct ggml_tensor * ffn_down_b; // b2
1885
1976
  struct ggml_tensor * ffn_up_b; // b3
@@ -2014,20 +2105,22 @@ struct llama_vocab {
2014
2105
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
2015
2106
 
2016
2107
  // default LLaMA special tokens
2017
- id special_bos_id = 1;
2018
- id special_eos_id = 2;
2019
- id special_unk_id = 0;
2020
- id special_sep_id = -1;
2021
- id special_pad_id = -1;
2108
+ id special_bos_id = 1;
2109
+ id special_eos_id = 2;
2110
+ id special_unk_id = 0;
2111
+ id special_sep_id = -1;
2112
+ id special_pad_id = -1;
2113
+ id special_cls_id = -1;
2114
+ id special_mask_id = -1;
2022
2115
 
2023
2116
  int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
2024
2117
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
2025
2118
 
2026
2119
  id linefeed_id = 13;
2027
- id special_prefix_id = 32007;
2028
- id special_middle_id = 32009;
2029
- id special_suffix_id = 32008;
2030
- id special_eot_id = 32010;
2120
+ id special_prefix_id = -1;
2121
+ id special_suffix_id = -1;
2122
+ id special_middle_id = -1;
2123
+ id special_eot_id = -1;
2031
2124
 
2032
2125
  bool add_space_prefix = true;
2033
2126
 
@@ -2175,7 +2268,7 @@ struct llama_context {
2175
2268
 
2176
2269
  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
2177
2270
  size_t output_size = 0; // capacity (of tokens positions) for the output buffers
2178
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
2271
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
2179
2272
 
2180
2273
  bool logits_all = false;
2181
2274
 
@@ -3533,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
3533
3626
  case MODEL_3B: return "3B";
3534
3627
  case MODEL_7B: return "7B";
3535
3628
  case MODEL_8B: return "8B";
3629
+ case MODEL_12B: return "12B";
3536
3630
  case MODEL_13B: return "13B";
3537
3631
  case MODEL_14B: return "14B";
3538
3632
  case MODEL_15B: return "15B";
@@ -3548,6 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
3548
3642
  case MODEL_MEDIUM: return "0.4B";
3549
3643
  case MODEL_LARGE: return "0.8B";
3550
3644
  case MODEL_XL: return "1.5B";
3645
+ case MODEL_A2_7B: return "A2.7B";
3646
+ case MODEL_8x7B: return "8x7B";
3647
+ case MODEL_8x22B: return "8x22B";
3648
+ case MODEL_16x12B: return "16x12B";
3551
3649
  default: return "?B";
3552
3650
  }
3553
3651
  }
@@ -3662,15 +3760,23 @@ static void llm_load_hparams(
3662
3760
  {
3663
3761
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3664
3762
 
3665
- switch (hparams.n_layer) {
3666
- case 22: model.type = e_model::MODEL_1B; break;
3667
- case 26: model.type = e_model::MODEL_3B; break;
3668
- case 32: model.type = e_model::MODEL_7B; break;
3669
- case 40: model.type = e_model::MODEL_13B; break;
3670
- case 48: model.type = e_model::MODEL_34B; break;
3671
- case 60: model.type = e_model::MODEL_30B; break;
3672
- case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
3673
- default: model.type = e_model::MODEL_UNKNOWN;
3763
+ if (hparams.n_expert == 8) {
3764
+ switch (hparams.n_layer) {
3765
+ case 32: model.type = e_model::MODEL_8x7B; break;
3766
+ case 56: model.type = e_model::MODEL_8x22B; break;
3767
+ default: model.type = e_model::MODEL_UNKNOWN;
3768
+ }
3769
+ } else {
3770
+ switch (hparams.n_layer) {
3771
+ case 22: model.type = e_model::MODEL_1B; break;
3772
+ case 26: model.type = e_model::MODEL_3B; break;
3773
+ case 32: model.type = e_model::MODEL_7B; break;
3774
+ case 40: model.type = e_model::MODEL_13B; break;
3775
+ case 48: model.type = e_model::MODEL_34B; break;
3776
+ case 60: model.type = e_model::MODEL_30B; break;
3777
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
3778
+ default: model.type = e_model::MODEL_UNKNOWN;
3779
+ }
3674
3780
  }
3675
3781
  } break;
3676
3782
  case LLM_ARCH_MINICPM:
@@ -3812,6 +3918,7 @@ static void llm_load_hparams(
3812
3918
  switch (hparams.n_layer) {
3813
3919
  case 24: model.type = e_model::MODEL_1B; break;
3814
3920
  case 32: model.type = e_model::MODEL_3B; break;
3921
+ case 40: model.type = e_model::MODEL_12B; break;
3815
3922
  default: model.type = e_model::MODEL_UNKNOWN;
3816
3923
  }
3817
3924
  } break;
@@ -3836,6 +3943,14 @@ static void llm_load_hparams(
3836
3943
  default: model.type = e_model::MODEL_UNKNOWN;
3837
3944
  }
3838
3945
  } break;
3946
+ case LLM_ARCH_QWEN2MOE:
3947
+ {
3948
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3949
+ switch (hparams.n_layer) {
3950
+ case 24: model.type = e_model::MODEL_A2_7B; break;
3951
+ default: model.type = e_model::MODEL_UNKNOWN;
3952
+ }
3953
+ } break;
3839
3954
  case LLM_ARCH_PHI2:
3840
3955
  {
3841
3956
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3961,6 +4076,28 @@ static void llm_load_hparams(
3961
4076
  default: model.type = e_model::MODEL_UNKNOWN;
3962
4077
  }
3963
4078
  } break;
4079
+ case LLM_ARCH_DBRX:
4080
+ {
4081
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4082
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
4083
+
4084
+ switch (hparams.n_layer) {
4085
+ case 40: model.type = e_model::MODEL_16x12B; break;
4086
+ default: model.type = e_model::MODEL_UNKNOWN;
4087
+ }
4088
+ } break;
4089
+ case LLM_ARCH_OLMO:
4090
+ {
4091
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4092
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
4093
+
4094
+ switch (hparams.n_layer) {
4095
+ case 22: model.type = e_model::MODEL_1B; break;
4096
+ case 32: model.type = e_model::MODEL_7B; break;
4097
+ case 80: model.type = e_model::MODEL_70B; break;
4098
+ default: model.type = e_model::MODEL_UNKNOWN;
4099
+ }
4100
+ } break;
3964
4101
  default: (void)0;
3965
4102
  }
3966
4103
 
@@ -3974,7 +4111,9 @@ static void llm_load_hparams(
3974
4111
  }
3975
4112
 
3976
4113
  // TODO: This should probably be in llama.h
3977
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
4114
+ static std::vector<llama_vocab::id> llama_tokenize_internal(
4115
+ const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
4116
+ );
3978
4117
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
3979
4118
 
3980
4119
  static void llm_load_vocab(
@@ -3996,23 +4135,53 @@ static void llm_load_vocab(
3996
4135
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
3997
4136
 
3998
4137
  // default special tokens
3999
- vocab.special_bos_id = -1;
4000
- vocab.special_eos_id = -1;
4001
- vocab.special_unk_id = -1;
4002
- vocab.special_sep_id = -1;
4003
- vocab.special_pad_id = -1;
4004
- vocab.linefeed_id = -1;
4138
+ vocab.special_bos_id = -1;
4139
+ vocab.special_eos_id = -1;
4140
+ vocab.special_unk_id = -1;
4141
+ vocab.special_sep_id = -1;
4142
+ vocab.special_pad_id = -1;
4143
+ vocab.special_cls_id = -1;
4144
+ vocab.special_mask_id = -1;
4145
+ vocab.linefeed_id = -1;
4005
4146
 
4006
4147
  return;
4007
4148
  } else if (tokenizer_name == "llama") {
4008
4149
  vocab.type = LLAMA_VOCAB_TYPE_SPM;
4009
4150
 
4010
4151
  // default special tokens
4011
- vocab.special_bos_id = 1;
4012
- vocab.special_eos_id = 2;
4013
- vocab.special_unk_id = 0;
4014
- vocab.special_sep_id = -1;
4015
- vocab.special_pad_id = -1;
4152
+ vocab.special_bos_id = 1;
4153
+ vocab.special_eos_id = 2;
4154
+ vocab.special_unk_id = 0;
4155
+ vocab.special_sep_id = -1;
4156
+ vocab.special_pad_id = -1;
4157
+ vocab.special_cls_id = -1;
4158
+ vocab.special_mask_id = -1;
4159
+
4160
+ // For Fill-In-the-Middle (FIM)/infill models which where converted
4161
+ // prior to support of FIM special tokens in GGUF, the following
4162
+ // will allow those models to continue to work. The general names
4163
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4164
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4165
+ // new versions of these models have been published.
4166
+ std::string gen_name;
4167
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4168
+
4169
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4170
+ [](unsigned char c){ return std::tolower(c); });
4171
+
4172
+ if (gen_name.find("code") != std::string::npos) {
4173
+ if (model.arch == LLM_ARCH_LLAMA) {
4174
+ vocab.special_prefix_id = 32007;
4175
+ vocab.special_suffix_id = 32008;
4176
+ vocab.special_middle_id = 32009;
4177
+ vocab.special_eot_id = 32010;
4178
+ } else if (model.arch == LLM_ARCH_GEMMA) {
4179
+ vocab.special_prefix_id = 67;
4180
+ vocab.special_suffix_id = 69;
4181
+ vocab.special_middle_id = 68;
4182
+ vocab.special_eot_id = 70;
4183
+ }
4184
+ }
4016
4185
 
4017
4186
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4018
4187
  if (add_space_prefix_keyidx != -1) {
@@ -4047,20 +4216,24 @@ static void llm_load_vocab(
4047
4216
  }
4048
4217
 
4049
4218
  // default special tokens
4050
- vocab.special_bos_id = 11;
4051
- vocab.special_eos_id = 11;
4052
- vocab.special_unk_id = -1;
4053
- vocab.special_sep_id = -1;
4054
- vocab.special_pad_id = -1;
4219
+ vocab.special_bos_id = 11;
4220
+ vocab.special_eos_id = 11;
4221
+ vocab.special_unk_id = -1;
4222
+ vocab.special_sep_id = -1;
4223
+ vocab.special_pad_id = -1;
4224
+ vocab.special_cls_id = -1;
4225
+ vocab.special_mask_id = -1;
4055
4226
  } else if (tokenizer_name == "bert") {
4056
4227
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
4057
4228
 
4058
4229
  // default special tokens
4059
- vocab.special_bos_id = 101;
4060
- vocab.special_eos_id = 102;
4061
- vocab.special_unk_id = 100;
4062
- vocab.special_sep_id = -1;
4063
- vocab.special_pad_id = -1;
4230
+ vocab.special_bos_id = -1;
4231
+ vocab.special_eos_id = -1;
4232
+ vocab.special_unk_id = 100;
4233
+ vocab.special_sep_id = 102;
4234
+ vocab.special_pad_id = 0;
4235
+ vocab.special_cls_id = 101;
4236
+ vocab.special_mask_id = 103;
4064
4237
  vocab.add_space_prefix = false;
4065
4238
  } else {
4066
4239
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
@@ -4123,11 +4296,17 @@ static void llm_load_vocab(
4123
4296
  // special tokens
4124
4297
  {
4125
4298
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
4126
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4127
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4128
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4129
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4130
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4299
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4300
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4301
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4302
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4303
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4304
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4305
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4306
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
4307
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
4308
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
4309
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
4131
4310
  };
4132
4311
  for (const auto & it : special_token_types) {
4133
4312
  const std::string & key = kv(std::get<0>(it));
@@ -4319,12 +4498,14 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4319
4498
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
4320
4499
 
4321
4500
  // special tokens
4322
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4323
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4324
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4325
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4326
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4327
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4501
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4502
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4503
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4504
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4505
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4506
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4507
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4508
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4328
4509
  }
4329
4510
 
4330
4511
  // Returns false if cancelled by progress_callback
@@ -4342,6 +4523,13 @@ static bool llm_load_tensors(
4342
4523
 
4343
4524
  auto & hparams = model.hparams;
4344
4525
 
4526
+ #ifdef GGML_USE_SYCL
4527
+ // disable MoE with SYCL until mul_mat_id is updated
4528
+ if (hparams.n_expert > 0) {
4529
+ n_gpu_layers = 0;
4530
+ }
4531
+ #endif
4532
+
4345
4533
  model.split_mode = split_mode;
4346
4534
  model.main_gpu = main_gpu;
4347
4535
  model.n_gpu_layers = n_gpu_layers;
@@ -4439,7 +4627,7 @@ static bool llm_load_tensors(
4439
4627
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4440
4628
 
4441
4629
  // for moe merged tensors
4442
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4630
+ ctx_size += ggml_tensor_overhead()*n_layer*3;
4443
4631
 
4444
4632
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4445
4633
  for (auto & it : buft_layer_count) {
@@ -4635,6 +4823,39 @@ static bool llm_load_tensors(
4635
4823
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4636
4824
  }
4637
4825
  } break;
4826
+ case LLM_ARCH_DBRX:
4827
+ {
4828
+ if (n_expert == 0) {
4829
+ throw std::runtime_error("DBRX model cannot have zero experts");
4830
+ }
4831
+
4832
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4833
+
4834
+ // output
4835
+ {
4836
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4837
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4838
+ }
4839
+
4840
+ for (int i = 0; i < n_layer; ++i) {
4841
+ ggml_context * ctx_layer = ctx_for_layer(i);
4842
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4843
+
4844
+ auto & layer = model.layers[i];
4845
+
4846
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4847
+
4848
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4849
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4850
+
4851
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4852
+
4853
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4854
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4855
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
4856
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4857
+ }
4858
+ } break;
4638
4859
  case LLM_ARCH_BAICHUAN:
4639
4860
  {
4640
4861
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4949,8 +5170,13 @@ static bool llm_load_tensors(
4949
5170
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
4950
5171
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
4951
5172
 
4952
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4953
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5173
+ // optional q and k layernorms, present in StableLM 2 12B
5174
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5175
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5176
+
5177
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5178
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5179
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4954
5180
 
4955
5181
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4956
5182
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -4993,7 +5219,13 @@ static bool llm_load_tensors(
4993
5219
  // output
4994
5220
  {
4995
5221
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4996
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5222
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5223
+ // if output is NULL, init from the input tok embed
5224
+ if (model.output == NULL) {
5225
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5226
+ ml.n_created--; // artificial tensor
5227
+ ml.size_data += ggml_nbytes(model.output);
5228
+ }
4997
5229
  }
4998
5230
 
4999
5231
  for (int i = 0; i < n_layer; ++i) {
@@ -5021,6 +5253,54 @@ static bool llm_load_tensors(
5021
5253
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5022
5254
  }
5023
5255
  } break;
5256
+ case LLM_ARCH_QWEN2MOE:
5257
+ {
5258
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5259
+
5260
+ // output
5261
+ {
5262
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5263
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5264
+ }
5265
+
5266
+ for (int i = 0; i < n_layer; ++i) {
5267
+ ggml_context * ctx_layer = ctx_for_layer(i);
5268
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5269
+
5270
+ auto & layer = model.layers[i];
5271
+
5272
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5273
+
5274
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5275
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5276
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5277
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5278
+
5279
+ // optional bias tensors
5280
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5281
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5282
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5283
+
5284
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5285
+
5286
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5287
+
5288
+ GGML_ASSERT(hparams.n_expert > 0);
5289
+ GGML_ASSERT(hparams.n_expert_used > 0);
5290
+
5291
+ // MoE branch
5292
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
5293
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5294
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5295
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5296
+
5297
+ // Shared expert branch
5298
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5299
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5300
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5301
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
5302
+ }
5303
+ } break;
5024
5304
  case LLM_ARCH_PHI2:
5025
5305
  {
5026
5306
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5404,11 +5684,47 @@ static bool llm_load_tensors(
5404
5684
 
5405
5685
  layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5406
5686
 
5687
+ if (n_layer >= 64){
5688
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
5689
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
5690
+ }
5691
+
5692
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5693
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5694
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5695
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5696
+
5697
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5698
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5699
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5700
+ }
5701
+ } break;
5702
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
5703
+ {
5704
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5705
+
5706
+ // output
5707
+ {
5708
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5709
+ // if output is NULL, init from the input tok embed
5710
+ if (model.output == NULL) {
5711
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5712
+ ml.n_created--; // artificial tensor
5713
+ ml.size_data += ggml_nbytes(model.output);
5714
+ }
5715
+ }
5716
+
5717
+ for (int i = 0; i < n_layer; ++i) {
5718
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5719
+
5720
+ auto & layer = model.layers[i];
5721
+
5407
5722
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5408
5723
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5409
5724
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5410
5725
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5411
5726
 
5727
+
5412
5728
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5413
5729
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5414
5730
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5849,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
5849
6165
  return cur;
5850
6166
  }
5851
6167
 
6168
+ static struct ggml_tensor * llm_build_moe_ffn(
6169
+ struct ggml_context * ctx,
6170
+ struct ggml_tensor * cur,
6171
+ struct ggml_tensor * gate_inp,
6172
+ struct ggml_tensor * up_exps,
6173
+ struct ggml_tensor * gate_exps,
6174
+ struct ggml_tensor * down_exps,
6175
+ int64_t n_expert,
6176
+ int64_t n_expert_used,
6177
+ llm_ffn_op_type type_op,
6178
+ bool norm_w,
6179
+ const llm_build_cb & cb,
6180
+ int il) {
6181
+ int64_t n_embd = cur->ne[0];
6182
+ int64_t n_tokens = cur->ne[1];
6183
+
6184
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
6185
+ cb(logits, "ffn_moe_logits", il);
6186
+
6187
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
6188
+ cb(probs, "ffn_moe_probs", il);
6189
+
6190
+ // select experts
6191
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
6192
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6193
+ cb(selected_experts, "ffn_moe_topk", il);
6194
+
6195
+ ggml_tensor * weights = ggml_get_rows(ctx,
6196
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
6197
+ cb(weights, "ffn_moe_weights", il);
6198
+
6199
+ if (norm_w) {
6200
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
6201
+
6202
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
6203
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6204
+
6205
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
6206
+ cb(weights, "ffn_moe_weights_norm", il);
6207
+
6208
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6209
+ }
6210
+
6211
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6212
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6213
+ cb(up, "ffn_moe_up", il);
6214
+
6215
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6216
+ cb(gate, "ffn_moe_gate", il);
6217
+
6218
+ switch (type_op) {
6219
+ case LLM_FFN_SILU:
6220
+ {
6221
+ gate = ggml_silu(ctx, gate);
6222
+ cb(gate, "ffn_moe_silu", il);
6223
+ } break;
6224
+ case LLM_FFN_GELU:
6225
+ {
6226
+ gate = ggml_gelu(ctx, gate);
6227
+ cb(gate, "ffn_moe_gelu", il);
6228
+ } break;
6229
+ default:
6230
+ GGML_ASSERT(false);
6231
+ }
6232
+
6233
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
6234
+ cb(par, "ffn_moe_gate_par", il);
6235
+
6236
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
6237
+ cb(experts, "ffn_moe_down", il);
6238
+
6239
+ experts = ggml_mul(ctx, experts, weights);
6240
+
6241
+ // aggregate experts
6242
+ ggml_tensor * moe_out = nullptr;
6243
+ for (int i = 0; i < n_expert_used; ++i) {
6244
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
6245
+ experts->nb[2], i*experts->nb[1]);
6246
+
6247
+ if (i == 0) {
6248
+ moe_out = cur_expert;
6249
+ } else {
6250
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
6251
+ }
6252
+ }
6253
+
6254
+ if (n_expert_used == 1) {
6255
+ // avoid returning a non-contiguous tensor
6256
+ moe_out = ggml_cont(ctx, moe_out);
6257
+ }
6258
+
6259
+ return moe_out;
6260
+ }
6261
+
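
[Editor's note, illustrative sketch, not part of the diff] The new llm_build_moe_ffn() helper above expresses the expert routing with ggml ops (ggml_soft_max, ggml_top_k, ggml_get_rows, ggml_div). For readers following the math, the same per-token routing in plain C++: softmax over the router logits, top-k selection, and the optional renormalization controlled by norm_w. The route_token function is a hypothetical stand-alone sketch, not part of llama.cpp:

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

struct routed_expert { int id; float weight; };

// per-token routing: softmax -> top-k -> (optionally) renormalize the k selected weights
static std::vector<routed_expert> route_token(const std::vector<float> & router_logits,
                                              int n_expert_used, bool norm_w) {
    // softmax over all experts ("ffn_moe_probs"), max-subtracted for numerical stability
    const float mx = *std::max_element(router_logits.begin(), router_logits.end());
    std::vector<float> probs(router_logits.size());
    float sum = 0.0f;
    for (size_t e = 0; e < probs.size(); ++e) { probs[e] = std::exp(router_logits[e] - mx); sum += probs[e]; }
    for (float & p : probs) { p /= sum; }

    // top-k experts by probability ("ffn_moe_topk")
    std::vector<int> order(probs.size());
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + n_expert_used, order.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    std::vector<routed_expert> selected;
    float selected_sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) {
        selected.push_back({order[k], probs[order[k]]});
        selected_sum += probs[order[k]];
    }
    if (norm_w) { // "ffn_moe_weights_norm": make the k selected weights sum to 1
        for (auto & s : selected) { s.weight /= selected_sum; }
    }
    // the token's FFN output is then sum_k selected[k].weight * expert_{selected[k].id}(x)
    return selected;
}
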
5852
6262
  // if max_alibi_bias > 0 then apply ALiBi
5853
6263
  static struct ggml_tensor * llm_build_kqv(
5854
6264
  struct ggml_context * ctx,
@@ -6392,62 +6802,15 @@ struct llm_build_context {
6392
6802
  LLM_NORM_RMS, cb, il);
6393
6803
  cb(cur, "ffn_norm", il);
6394
6804
 
6395
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6396
- cb(logits, "ffn_moe_logits", il);
6397
-
6398
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6399
- cb(probs, "ffn_moe_probs", il);
6400
-
6401
- // select experts
6402
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6403
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6404
-
6405
- ggml_tensor * weights = ggml_get_rows(ctx0,
6406
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6407
- cb(weights, "ffn_moe_weights", il);
6408
-
6409
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6410
-
6411
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6412
- cb(weights_sum, "ffn_moe_weights_sum", il);
6413
-
6414
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6415
- cb(weights, "ffn_moe_weights_norm", il);
6416
-
6417
- // compute expert outputs
6418
- ggml_tensor * moe_out = nullptr;
6419
-
6420
- for (int i = 0; i < n_expert_used; ++i) {
6421
- ggml_tensor * cur_expert;
6422
-
6423
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6424
- cb(cur_up, "ffn_moe_up", il);
6425
-
6426
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6427
- cb(cur_gate, "ffn_moe_gate", il);
6428
-
6429
- cur_gate = ggml_silu(ctx0, cur_gate);
6430
- cb(cur_gate, "ffn_moe_silu", il);
6431
-
6432
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6433
- cb(cur_expert, "ffn_moe_gate_par", il);
6434
-
6435
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6436
- cb(cur_expert, "ffn_moe_down", il);
6437
-
6438
- cur_expert = ggml_mul(ctx0, cur_expert,
6439
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6440
- cb(cur_expert, "ffn_moe_weighted", il);
6441
-
6442
- if (i == 0) {
6443
- moe_out = cur_expert;
6444
- } else {
6445
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6446
- cb(moe_out, "ffn_moe_out", il);
6447
- }
6448
- }
6449
-
6450
- cur = moe_out;
6805
+ cur = llm_build_moe_ffn(ctx0, cur,
6806
+ model.layers[il].ffn_gate_inp,
6807
+ model.layers[il].ffn_up_exps,
6808
+ model.layers[il].ffn_gate_exps,
6809
+ model.layers[il].ffn_down_exps,
6810
+ n_expert, n_expert_used,
6811
+ LLM_FFN_SILU, true,
6812
+ cb, il);
6813
+ cb(cur, "ffn_moe_out", il);
6451
6814
  }
6452
6815
 
6453
6816
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6926,63 +7289,15 @@ struct llm_build_context {
6926
7289
  LLM_NORM_RMS, cb, il);
6927
7290
  cb(cur, "ffn_norm", il);
6928
7291
 
6929
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6930
- cb(logits, "ffn_moe_logits", il);
6931
-
6932
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6933
- cb(probs, "ffn_moe_probs", il);
6934
-
6935
- // select experts
6936
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6937
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6938
-
6939
- ggml_tensor * weights = ggml_get_rows(ctx0,
6940
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6941
- cb(weights, "ffn_moe_weights", il);
6942
-
6943
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6944
-
6945
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6946
- cb(weights_sum, "ffn_moe_weights_sum", il);
6947
-
6948
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6949
- cb(weights, "ffn_moe_weights_norm", il);
6950
-
6951
- // compute expert outputs
6952
- ggml_tensor * moe_out = nullptr;
6953
-
6954
- for (int i = 0; i < n_expert_used; ++i) {
6955
- ggml_tensor * cur_expert;
6956
-
6957
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6958
- cb(cur_up, "ffn_moe_up", il);
6959
-
6960
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6961
- cb(cur_gate, "ffn_moe_gate", il);
6962
-
6963
- //GeLU
6964
- cur_gate = ggml_gelu(ctx0, cur_gate);
6965
- cb(cur_gate, "ffn_moe_gelu", il);
6966
-
6967
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6968
- cb(cur_expert, "ffn_moe_gate_par", il);
6969
-
6970
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6971
- cb(cur_expert, "ffn_moe_down", il);
6972
-
6973
- cur_expert = ggml_mul(ctx0, cur_expert,
6974
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6975
- cb(cur_expert, "ffn_moe_weighted", il);
6976
-
6977
- if (i == 0) {
6978
- moe_out = cur_expert;
6979
- } else {
6980
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6981
- cb(moe_out, "ffn_moe_out", il);
6982
- }
6983
- }
6984
-
6985
- cur = moe_out;
7292
+ cur = llm_build_moe_ffn(ctx0, cur,
7293
+ model.layers[il].ffn_gate_inp,
7294
+ model.layers[il].ffn_up_exps,
7295
+ model.layers[il].ffn_gate_exps,
7296
+ model.layers[il].ffn_down_exps,
7297
+ n_expert, n_expert_used,
7298
+ LLM_FFN_GELU, true,
7299
+ cb, il);
7300
+ cb(cur, "ffn_moe_out", il);
6986
7301
 
6987
7302
  // Grok
6988
7303
  // if layer_out_norm is present then apply it before adding the input
@@ -6994,7 +7309,6 @@ struct llm_build_context {
6994
7309
  cb(cur, "layer_out_norm", il);
6995
7310
  }
6996
7311
 
6997
-
6998
7312
  cur = ggml_add(ctx0, cur, ffn_inp);
6999
7313
  cb(cur, "ffn_out", il);
7000
7314
 
@@ -7030,12 +7344,16 @@ struct llm_build_context {
7030
7344
  return gf;
7031
7345
  }
7032
7346
 
7033
- struct ggml_cgraph * build_starcoder() {
7347
+ struct ggml_cgraph * build_dbrx() {
7034
7348
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7035
7349
 
7350
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
7351
+ int32_t n_tokens = this->n_tokens;
7352
+
7036
7353
  const int64_t n_embd_head = hparams.n_embd_head_v;
7037
7354
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7038
7355
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7356
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7039
7357
 
7040
7358
  struct ggml_tensor * cur;
7041
7359
  struct ggml_tensor * inpL;
@@ -7048,16 +7366,140 @@ struct llm_build_context {
7048
7366
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7049
7367
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7050
7368
 
7051
- struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7052
- cb(pos, "pos_embd", -1);
7053
-
7054
- inpL = ggml_add(ctx0, inpL, pos);
7055
- cb(inpL, "inpL", -1);
7056
-
7057
7369
  for (int il = 0; il < n_layer; ++il) {
7370
+ struct ggml_tensor * inpSA = inpL;
7371
+
7372
+ // norm
7058
7373
  cur = llm_build_norm(ctx0, inpL, hparams,
7059
- model.layers[il].attn_norm,
7060
- model.layers[il].attn_norm_b,
7374
+ model.layers[il].attn_norm, NULL,
7375
+ LLM_NORM, cb, il);
7376
+ cb(cur, "attn_norm", il);
7377
+
7378
+ // self-attention
7379
+ {
7380
+ struct ggml_tensor * Qcur = nullptr;
7381
+ struct ggml_tensor * Kcur = nullptr;
7382
+ struct ggml_tensor * Vcur = nullptr;
7383
+
7384
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7385
+ cb(cur, "wqkv", il);
7386
+
7387
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
7388
+ cb(cur, "wqkv_clamped", il);
7389
+
7390
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7391
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7392
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7393
+
7394
+ cb(Qcur, "Qcur", il);
7395
+ cb(Kcur, "Kcur", il);
7396
+ cb(Vcur, "Vcur", il);
7397
+
7398
+ Qcur = ggml_rope_custom(
7399
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7401
+ ext_factor, attn_factor, beta_fast, beta_slow
7402
+ );
7403
+ cb(Qcur, "Qcur", il);
7404
+
7405
+ Kcur = ggml_rope_custom(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7407
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7408
+ ext_factor, attn_factor, beta_fast, beta_slow
7409
+ );
7410
+ cb(Kcur, "Kcur", il);
7411
+
7412
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7413
+ model.layers[il].wo, NULL,
7414
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7415
+ }
7416
+
7417
+ if (il == n_layer - 1) {
7418
+ // skip computing output for unused tokens
7419
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7420
+ n_tokens = n_outputs;
7421
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7422
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7423
+ }
7424
+
7425
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7426
+ cb(ffn_inp, "ffn_inp", il);
7427
+
7428
+ // feed-forward network
7429
+ // MoE branch
7430
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7431
+ model.layers[il].attn_out_norm, NULL,
7432
+ LLM_NORM, cb, il);
7433
+ cb(cur, "attn_out_norm", il);
7434
+
7435
+ cur = llm_build_moe_ffn(ctx0, cur,
7436
+ model.layers[il].ffn_gate_inp,
7437
+ model.layers[il].ffn_up_exps,
7438
+ model.layers[il].ffn_gate_exps,
7439
+ model.layers[il].ffn_down_exps,
7440
+ n_expert, n_expert_used,
7441
+ LLM_FFN_SILU, true,
7442
+ cb, il);
7443
+ cb(cur, "ffn_moe_out", il);
7444
+
7445
+ cur = ggml_add(ctx0, cur, ffn_inp);
7446
+ cb(cur, "ffn_out", il);
7447
+
7448
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7449
+ if (layer_dir != nullptr) {
7450
+ cur = ggml_add(ctx0, cur, layer_dir);
7451
+ }
7452
+ cb(cur, "l_out", il);
7453
+
7454
+ // input for next layer
7455
+ inpL = cur;
7456
+ }
7457
+
7458
+ cur = inpL;
7459
+
7460
+ cur = llm_build_norm(ctx0, cur, hparams,
7461
+ model.output_norm, NULL,
7462
+ LLM_NORM, cb, -1);
7463
+ cb(cur, "result_norm", -1);
7464
+
7465
+ // lm_head
7466
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7467
+
7468
+ cb(cur, "result_output", -1);
7469
+
7470
+ ggml_build_forward_expand(gf, cur);
7471
+
7472
+ return gf;
7473
+ }
7474
+
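
[Editor's note, illustrative sketch, not part of the diff] In build_dbrx() above, the fused QKV projection is clamped to +/-f_clamp_kqv and then sliced into Q, K and V by row offset: with grouped-query attention the projection produces n_embd + 2*n_embd_gqa values per token, Q taking the first n_embd, K the next n_embd_gqa and V the rest, which is what the three ggml_view_2d offsets encode. A small stand-alone sketch of that slicing (split_qkv is hypothetical):

#include <vector>

struct qkv_split { std::vector<float> q, k, v; };

// slice one token's fused QKV row (size n_embd + 2*n_embd_gqa) into Q, K and V
static qkv_split split_qkv(const std::vector<float> & fused, int n_embd, int n_embd_gqa) {
    qkv_split out;
    out.q.assign(fused.begin(),                       fused.begin() + n_embd);
    out.k.assign(fused.begin() + n_embd,              fused.begin() + n_embd + n_embd_gqa);
    out.v.assign(fused.begin() + n_embd + n_embd_gqa, fused.begin() + n_embd + 2*n_embd_gqa);
    return out;
}
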
7475
+ struct ggml_cgraph * build_starcoder() {
7476
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7477
+
7478
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7479
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7480
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7481
+
7482
+ struct ggml_tensor * cur;
7483
+ struct ggml_tensor * inpL;
7484
+
7485
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7486
+
7487
+ // inp_pos - contains the positions
7488
+ struct ggml_tensor * inp_pos = build_inp_pos();
7489
+
7490
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7491
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7492
+
7493
+ struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7494
+ cb(pos, "pos_embd", -1);
7495
+
7496
+ inpL = ggml_add(ctx0, inpL, pos);
7497
+ cb(inpL, "inpL", -1);
7498
+
7499
+ for (int il = 0; il < n_layer; ++il) {
7500
+ cur = llm_build_norm(ctx0, inpL, hparams,
7501
+ model.layers[il].attn_norm,
7502
+ model.layers[il].attn_norm_b,
7061
7503
  LLM_NORM, cb, il);
7062
7504
  cb(cur, "attn_norm", il);
7063
7505
 
@@ -7882,7 +8324,7 @@ struct llm_build_context {
7882
8324
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7883
8325
 
7884
8326
  for (int il = 0; il < n_layer; ++il) {
7885
- struct ggml_tensor * inpSA = inpL;
8327
+
7886
8328
 
7887
8329
  // norm
7888
8330
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7891,6 +8333,8 @@ struct llm_build_context {
7891
8333
  LLM_NORM, cb, il);
7892
8334
  cb(cur, "attn_norm", il);
7893
8335
 
8336
+ struct ggml_tensor * inpSA = cur;
8337
+
7894
8338
  // self-attention
7895
8339
  {
7896
8340
  // compute Q and K and RoPE them
@@ -7915,15 +8359,36 @@ struct llm_build_context {
7915
8359
  cb(Vcur, "Vcur", il);
7916
8360
  }
7917
8361
 
8362
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8363
+ cb(Qcur, "Qcur", il);
8364
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8365
+ cb(Kcur, "Kcur", il);
8366
+
8367
+ if (model.layers[il].attn_q_norm) {
8368
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8369
+ model.layers[il].attn_q_norm,
8370
+ NULL,
8371
+ LLM_NORM, cb, il);
8372
+ cb(Qcur, "Qcur", il);
8373
+ }
8374
+ if (model.layers[il].attn_k_norm) {
8375
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8376
+ model.layers[il].attn_k_norm,
8377
+ NULL,
8378
+ LLM_NORM, cb, il);
8379
+ cb(Kcur, "Kcur", il);
8380
+ }
8381
+
8382
+
7918
8383
  Qcur = ggml_rope_custom(
7919
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8384
+ ctx0, Qcur, inp_pos,
7920
8385
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7921
8386
  ext_factor, attn_factor, beta_fast, beta_slow
7922
8387
  );
7923
8388
  cb(Qcur, "Qcur", il);
7924
8389
 
7925
8390
  Kcur = ggml_rope_custom(
7926
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8391
+ ctx0, Kcur, inp_pos,
7927
8392
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7928
8393
  ext_factor, attn_factor, beta_fast, beta_slow
7929
8394
  );
@@ -7938,20 +8403,25 @@ struct llm_build_context {
7938
8403
  // skip computing output for unused tokens
7939
8404
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7940
8405
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8406
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7941
8407
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7942
8408
  }
7943
8409
 
7944
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8410
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7945
8411
  cb(ffn_inp, "ffn_inp", il);
7946
8412
 
7947
8413
  // feed-forward network
7948
8414
  {
7949
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
7950
- model.layers[il].ffn_norm,
7951
- model.layers[il].ffn_norm_b,
7952
- LLM_NORM, cb, il);
7953
- cb(cur, "ffn_norm", il);
7954
-
8415
+ if (model.layers[il].ffn_norm) {
8416
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8417
+ model.layers[il].ffn_norm,
8418
+ model.layers[il].ffn_norm_b,
8419
+ LLM_NORM, cb, il);
8420
+ cb(cur, "ffn_norm", il);
8421
+ } else {
8422
+ // parallel residual
8423
+ cur = inpSA;
8424
+ }
7955
8425
  cur = llm_build_ffn(ctx0, cur,
7956
8426
  model.layers[il].ffn_up, NULL,
7957
8427
  model.layers[il].ffn_gate, NULL,
@@ -8141,12 +8611,6 @@ struct llm_build_context {
8141
8611
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8142
8612
  cb(Vcur, "Vcur", il);
8143
8613
 
8144
- // these nodes are added to the graph together so that they are not reordered
8145
- // by doing so, the number of splits in the graph is reduced
8146
- ggml_build_forward_expand(gf, Qcur);
8147
- ggml_build_forward_expand(gf, Kcur);
8148
- ggml_build_forward_expand(gf, Vcur);
8149
-
8150
8614
  Qcur = ggml_rope_custom(
8151
8615
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8152
8616
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8213,6 +8677,150 @@ struct llm_build_context {
8213
8677
  return gf;
8214
8678
  }
8215
8679
 
8680
+ struct ggml_cgraph * build_qwen2moe() {
8681
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8682
+
8683
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
8684
+ int32_t n_tokens = this->n_tokens;
8685
+
8686
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8687
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8688
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8689
+
8690
+ struct ggml_tensor * cur;
8691
+ struct ggml_tensor * inpL;
8692
+
8693
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8694
+
8695
+ // inp_pos - contains the positions
8696
+ struct ggml_tensor * inp_pos = build_inp_pos();
8697
+
8698
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8699
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8700
+
8701
+ for (int il = 0; il < n_layer; ++il) {
8702
+ struct ggml_tensor * inpSA = inpL;
8703
+
8704
+ // norm
8705
+ cur = llm_build_norm(ctx0, inpL, hparams,
8706
+ model.layers[il].attn_norm, NULL,
8707
+ LLM_NORM_RMS, cb, il);
8708
+ cb(cur, "attn_norm", il);
8709
+
8710
+ // self_attention
8711
+ {
8712
+ // compute Q and K and RoPE them
8713
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8714
+ cb(Qcur, "Qcur", il);
8715
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8716
+ cb(Qcur, "Qcur", il);
8717
+
8718
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8719
+ cb(Kcur, "Kcur", il);
8720
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8721
+ cb(Kcur, "Kcur", il);
8722
+
8723
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8724
+ cb(Vcur, "Vcur", il);
8725
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8726
+ cb(Vcur, "Vcur", il);
8727
+
8728
+ Qcur = ggml_rope_custom(
8729
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8730
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8731
+ ext_factor, attn_factor, beta_fast, beta_slow
8732
+ );
8733
+ cb(Qcur, "Qcur", il);
8734
+
8735
+ Kcur = ggml_rope_custom(
8736
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8737
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8738
+ ext_factor, attn_factor, beta_fast, beta_slow
8739
+ );
8740
+ cb(Kcur, "Kcur", il);
8741
+
8742
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8743
+ model.layers[il].wo, model.layers[il].bo,
8744
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8745
+ }
8746
+
8747
+ if (il == n_layer - 1) {
8748
+ // skip computing output for unused tokens
8749
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8750
+ n_tokens = n_outputs;
8751
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8752
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8753
+ }
8754
+
8755
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8756
+ cb(ffn_inp, "ffn_inp", il);
8757
+
8758
+ // MoE branch
8759
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8760
+ model.layers[il].ffn_norm, NULL,
8761
+ LLM_NORM_RMS, cb, il);
8762
+ cb(cur, "ffn_norm", il);
8763
+
8764
+ ggml_tensor * moe_out =
8765
+ llm_build_moe_ffn(ctx0, cur,
8766
+ model.layers[il].ffn_gate_inp,
8767
+ model.layers[il].ffn_up_exps,
8768
+ model.layers[il].ffn_gate_exps,
8769
+ model.layers[il].ffn_down_exps,
8770
+ n_expert, n_expert_used,
8771
+ LLM_FFN_SILU, false,
8772
+ cb, il);
8773
+ cb(cur, "ffn_moe_out", il);
8774
+
8775
+ // FFN shared expert
8776
+ {
8777
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
8778
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
8779
+
8780
+ // sigmoid
8781
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
8782
+ cb(cur_gate, "ffn_shexp_gate", il);
8783
+
8784
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
8785
+ model.layers[il].ffn_up_shexp, NULL,
8786
+ model.layers[il].ffn_gate_shexp, NULL,
8787
+ model.layers[il].ffn_down_shexp, NULL,
8788
+ NULL,
8789
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
8790
+ cb(cur_ffn, "ffn_shexp", il);
8791
+
8792
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
8793
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
8794
+
8795
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
8796
+ cb(moe_out, "ffn_out", il);
8797
+
8798
+ cur = moe_out;
8799
+ }
8800
+
8801
+ cur = ggml_add(ctx0, cur, ffn_inp);
8802
+ cb(cur, "l_out", il);
8803
+
8804
+ // input for next layer
8805
+ inpL = cur;
8806
+ }
8807
+
8808
+ cur = inpL;
8809
+
8810
+ cur = llm_build_norm(ctx0, cur, hparams,
8811
+ model.output_norm, NULL,
8812
+ LLM_NORM_RMS, cb, -1);
8813
+ cb(cur, "result_norm", -1);
8814
+
8815
+ // lm_head
8816
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8817
+ cb(cur, "result_output", -1);
8818
+
8819
+ ggml_build_forward_expand(gf, cur);
8820
+
8821
+ return gf;
8822
+ }
8823
+
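The shared-expert gate in build_qwen2moe() above has no dedicated sigmoid node; it relies on the identity silu(x) = x * sigmoid(x), so ggml_div(ggml_silu(x), x) yields sigmoid(x). A minimal standalone sketch of that identity in plain C++ (no ggml, values made up for illustration):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // silu(x) = x * sigmoid(x)
    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    int main() {
        const float x = 0.37f;                               // hypothetical shared-expert router logit
        const float gate_direct   = 1.0f / (1.0f + std::exp(-x));
        const float gate_via_silu = silu(x) / x;             // mirrors ggml_div(ggml_silu(x), x)
        assert(std::fabs(gate_direct - gate_via_silu) < 1e-6f);
        std::printf("gate = %f\n", gate_direct);             // the shared expert output is scaled by this gate
        return 0;
    }
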
8216
8824
  struct ggml_cgraph * build_phi2() {
8217
8825
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8218
8826
 
@@ -9452,6 +10060,31 @@ struct llm_build_context {
9452
10060
  cb(Vcur, "Vcur", il);
9453
10061
  }
9454
10062
 
10063
+ if (model.layers[il].attn_q_norm) {
10064
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
10065
+ ggml_element_size(Qcur) * n_embd_head,
10066
+ ggml_element_size(Qcur) * n_embd_head * n_head,
10067
+ 0);
10068
+ cb(Qcur, "Qcur", il);
10069
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
10070
+ ggml_element_size(Kcur) * n_embd_head,
10071
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
10072
+ 0);
10073
+ cb(Kcur, "Kcur", il);
10074
+
10075
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
10076
+ model.layers[il].attn_q_norm,
10077
+ NULL,
10078
+ LLM_NORM, cb, il);
10079
+ cb(Qcur, "Qcur", il);
10080
+
10081
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
10082
+ model.layers[il].attn_k_norm,
10083
+ NULL,
10084
+ LLM_NORM, cb, il);
10085
+ cb(Kcur, "Kcur", il);
10086
+ }
10087
+
9455
10088
  Qcur = ggml_rope_custom(
9456
10089
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
9457
10090
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -9522,6 +10155,139 @@ struct llm_build_context {
9522
10155
  return gf;
9523
10156
 
9524
10157
  }
10158
+
10159
+ // ref: https://allenai.org/olmo
10160
+ // based on the original build_llama() function, changes:
10161
+ // * non-parametric layer norm
10162
+ // * clamp qkv
10163
+ // * removed bias
10164
+ // * removed MoE
10165
+ struct ggml_cgraph * build_olmo() {
10166
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10167
+
10168
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10169
+ int32_t n_tokens = this->n_tokens;
10170
+
10171
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10172
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10173
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10174
+
10175
+ struct ggml_tensor * cur;
10176
+ struct ggml_tensor * inpL;
10177
+
10178
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10179
+
10180
+ // inp_pos - contains the positions
10181
+ struct ggml_tensor * inp_pos = build_inp_pos();
10182
+
10183
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10184
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10185
+
10186
+ for (int il = 0; il < n_layer; ++il) {
10187
+ struct ggml_tensor * inpSA = inpL;
10188
+
10189
+ // norm
10190
+ cur = llm_build_norm(ctx0, inpL, hparams,
10191
+ NULL, NULL,
10192
+ LLM_NORM, cb, il);
10193
+ cb(cur, "attn_norm", il);
10194
+
10195
+ // self-attention
10196
+ {
10197
+ // compute Q and K and RoPE them
10198
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10199
+ cb(Qcur, "Qcur", il);
10200
+ if (hparams.f_clamp_kqv > 0.0f) {
10201
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10202
+ cb(Qcur, "Qcur", il);
10203
+ }
10204
+
10205
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10206
+ cb(Kcur, "Kcur", il);
10207
+ if (hparams.f_clamp_kqv > 0.0f) {
10208
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10209
+ cb(Kcur, "Kcur", il);
10210
+ }
10211
+
10212
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10213
+ cb(Vcur, "Vcur", il);
10214
+ if (hparams.f_clamp_kqv > 0.0f) {
10215
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10216
+ cb(Vcur, "Vcur", il);
10217
+ }
10218
+
10219
+ Qcur = ggml_rope_custom(
10220
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10221
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10222
+ ext_factor, attn_factor, beta_fast, beta_slow
10223
+ );
10224
+ cb(Qcur, "Qcur", il);
10225
+
10226
+ Kcur = ggml_rope_custom(
10227
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10228
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10229
+ ext_factor, attn_factor, beta_fast, beta_slow
10230
+ );
10231
+ cb(Kcur, "Kcur", il);
10232
+
10233
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10234
+ model.layers[il].wo, nullptr,
10235
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10236
+ }
10237
+
10238
+ if (il == n_layer - 1) {
10239
+ // skip computing output for unused tokens
10240
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10241
+ n_tokens = n_outputs;
10242
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10243
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10244
+ }
10245
+
10246
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10247
+ cb(ffn_inp, "ffn_inp", il);
10248
+
10249
+ // feed-forward network
10250
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10251
+ NULL, NULL,
10252
+ LLM_NORM, cb, il);
10253
+ cb(cur, "ffn_norm", il);
10254
+
10255
+ cur = llm_build_ffn(ctx0, cur,
10256
+ model.layers[il].ffn_up, NULL,
10257
+ model.layers[il].ffn_gate, NULL,
10258
+ model.layers[il].ffn_down, NULL,
10259
+ NULL,
10260
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10261
+ cb(cur, "ffn_out", il);
10262
+
10263
+ cur = ggml_add(ctx0, cur, ffn_inp);
10264
+ cb(cur, "ffn_out", il);
10265
+
10266
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10267
+ if (layer_dir != nullptr) {
10268
+ cur = ggml_add(ctx0, cur, layer_dir);
10269
+ }
10270
+ cb(cur, "l_out", il);
10271
+
10272
+ // input for next layer
10273
+ inpL = cur;
10274
+ }
10275
+
10276
+ cur = inpL;
10277
+
10278
+ cur = llm_build_norm(ctx0, cur, hparams,
10279
+ NULL, NULL,
10280
+ LLM_NORM, cb, -1);
10281
+ cb(cur, "result_norm", -1);
10282
+
10283
+ // lm_head
10284
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10285
+ cb(cur, "result_output", -1);
10286
+
10287
+ ggml_build_forward_expand(gf, cur);
10288
+
10289
+ return gf;
10290
+ }
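build_olmo() above differs from build_llama() mainly in its non-parametric layer norm (both norm tensors passed as NULL) and the optional QKV clamp. A minimal sketch of the clamp step in plain C++, assuming a hypothetical f_clamp_kqv value; ggml_clamp applies the same element-wise limit:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float f_clamp_kqv = 8.0f;                      // assumed value; llama.cpp reads it from hparams
        float qcur[4] = { -12.5f, -3.0f, 0.25f, 9.75f };     // made-up activations
        for (float & v : qcur) {
            v = std::min(std::max(v, -f_clamp_kqv), f_clamp_kqv);
        }
        std::printf("%.2f %.2f %.2f %.2f\n", qcur[0], qcur[1], qcur[2], qcur[3]);  // -8.00 -3.00 0.25 8.00
        return 0;
    }
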
9525
10291
  };
9526
10292
 
9527
10293
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9671,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
9671
10437
  {
9672
10438
  result = llm.build_qwen2();
9673
10439
  } break;
10440
+ case LLM_ARCH_QWEN2MOE:
10441
+ {
10442
+ result = llm.build_qwen2moe();
10443
+ } break;
9674
10444
  case LLM_ARCH_PHI2:
9675
10445
  {
9676
10446
  result = llm.build_phi2();
@@ -9715,9 +10485,17 @@ static struct ggml_cgraph * llama_build_graph(
9715
10485
  {
9716
10486
  result = llm.build_xverse();
9717
10487
  } break;
9718
- case LLM_ARCH_COMMAND_R:
10488
+ case LLM_ARCH_COMMAND_R:
10489
+ {
10490
+ result = llm.build_command_r();
10491
+ } break;
10492
+ case LLM_ARCH_DBRX:
10493
+ {
10494
+ result = llm.build_dbrx();
10495
+ } break;
10496
+ case LLM_ARCH_OLMO:
9719
10497
  {
9720
- result = llm.build_command_r();
10498
+ result = llm.build_olmo();
9721
10499
  } break;
9722
10500
  default:
9723
10501
  GGML_ASSERT(false);
@@ -10409,6 +11187,9 @@ static int llama_decode_internal(
10409
11187
  n_outputs_prev += lctx.n_outputs;
10410
11188
  }
10411
11189
 
11190
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
11191
+ lctx.n_outputs = n_outputs;
11192
+
10412
11193
  // wait for the computation to finish (automatically done when obtaining the model output)
10413
11194
  //llama_synchronize(&lctx);
10414
11195
 
@@ -11052,7 +11833,7 @@ struct llm_tokenizer_bpe {
11052
11833
  add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
11053
11834
  }
11054
11835
 
11055
- // add the fnished tokens to the final list keeping correct order for next and prev
11836
+ // add the finished tokens to the final list keeping correct order for next and prev
11056
11837
  for (auto & sym : symbols) {
11057
11838
  if (sym.n > 0) {
11058
11839
  sym.prev = final_prev_index;
@@ -11321,9 +12102,6 @@ struct llm_tokenizer_wpm {
11321
12102
  output.push_back(vocab.special_unk_id);
11322
12103
  }
11323
12104
  }
11324
-
11325
- // append eos token
11326
- output.push_back(vocab.special_eos_id);
11327
12105
  }
11328
12106
 
11329
12107
  std::vector<std::string> preprocess(const std::string & text) {
@@ -11528,30 +12306,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
11528
12306
  }
11529
12307
  }
11530
12308
 
11531
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
12309
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
11532
12310
  std::vector<llama_vocab::id> output;
11533
-
11534
- // OG tokenizer behavior:
11535
- //
11536
- // tokenizer.encode('', add_bos=True) returns [1]
11537
- // tokenizer.encode('', add_bos=False) returns []
11538
-
11539
- if (bos && vocab.special_bos_id != -1) {
11540
- output.push_back(vocab.special_bos_id);
11541
- }
11542
-
11543
- if (raw_text.empty()) {
11544
- return output;
11545
- }
11546
-
11547
12311
  std::forward_list<fragment_buffer_variant> fragment_buffer;
11548
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
11549
12312
 
11550
- if (special) tokenizer_st_partition(vocab, fragment_buffer);
12313
+ if (!raw_text.empty()) {
12314
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
12315
+ if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
12316
+ }
11551
12317
 
11552
12318
  switch (vocab.type) {
11553
12319
  case LLAMA_VOCAB_TYPE_SPM:
11554
12320
  {
12321
+ // OG tokenizer behavior:
12322
+ //
12323
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
12324
+ // tokenizer.encode('', add_special_tokens=False) returns []
12325
+
12326
+ if (add_special && vocab.special_add_bos != 0) {
12327
+ GGML_ASSERT(vocab.special_bos_id != -1);
12328
+ output.push_back(vocab.special_bos_id);
12329
+ }
12330
+
11555
12331
  for (const auto & fragment : fragment_buffer) {
11556
12332
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11557
12333
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -11577,9 +12353,19 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11577
12353
  output.push_back(fragment.token);
11578
12354
  }
11579
12355
  }
12356
+
12357
+ if (add_special && vocab.special_add_eos == 1) {
12358
+ GGML_ASSERT(vocab.special_eos_id != -1);
12359
+ output.push_back(vocab.special_eos_id);
12360
+ }
11580
12361
  } break;
11581
12362
  case LLAMA_VOCAB_TYPE_BPE:
11582
12363
  {
12364
+ if (add_special && vocab.special_add_bos == 1) {
12365
+ GGML_ASSERT(vocab.special_bos_id != -1);
12366
+ output.push_back(vocab.special_bos_id);
12367
+ }
12368
+
11583
12369
  for (const auto & fragment : fragment_buffer) {
11584
12370
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11585
12371
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11593,9 +12379,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11593
12379
  output.push_back(fragment.token);
11594
12380
  }
11595
12381
  }
12382
+
12383
+ GGML_ASSERT(vocab.special_add_eos != 1);
11596
12384
  } break;
11597
12385
  case LLAMA_VOCAB_TYPE_WPM:
11598
12386
  {
12387
+ if (add_special) {
12388
+ GGML_ASSERT(vocab.special_cls_id != -1);
12389
+ output.push_back(vocab.special_cls_id);
12390
+ }
12391
+
11599
12392
  for (const auto & fragment : fragment_buffer) {
11600
12393
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
11601
12394
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -11609,6 +12402,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11609
12402
  output.push_back(fragment.token);
11610
12403
  }
11611
12404
  }
12405
+
12406
+ if (add_special) {
12407
+ GGML_ASSERT(vocab.special_sep_id != -1);
12408
+ output.push_back(vocab.special_sep_id);
12409
+ }
11612
12410
  } break;
11613
12411
  case LLAMA_VOCAB_TYPE_NONE:
11614
12412
  GGML_ASSERT(false);
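The hunk above renames the tokenizer flags to add_special (insert BOS/EOS/CLS/SEP according to the vocab's add_* settings) and parse_special (match special tokens inside the text). A hedged usage sketch of the public llama_tokenize() wrapper under the new names, assuming a model loaded elsewhere:

    #include <string>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> tokenize_text(const llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8);    // rough upper bound plus room for added specials
        int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                   tokens.data(), (int32_t) tokens.size(),
                                   /*add_special =*/ true, /*parse_special =*/ false);
        if (n < 0) {                                         // buffer too small: -n is the required count
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), true, false);
        }
        tokens.resize(n);
        return tokens;
    }
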
@@ -11775,7 +12573,9 @@ static void llama_grammar_advance_stack(
11775
12573
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11776
12574
 
11777
12575
  if (stack.empty()) {
11778
- new_stacks.emplace_back(stack);
12576
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
12577
+ new_stacks.emplace_back(stack);
12578
+ }
11779
12579
  return;
11780
12580
  }
11781
12581
 
@@ -11812,7 +12612,10 @@ static void llama_grammar_advance_stack(
11812
12612
  }
11813
12613
  case LLAMA_GRETYPE_CHAR:
11814
12614
  case LLAMA_GRETYPE_CHAR_NOT:
11815
- new_stacks.emplace_back(stack);
12615
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
12616
+ // only add the stack if it's not a duplicate of one we already have
12617
+ new_stacks.emplace_back(stack);
12618
+ }
11816
12619
  break;
11817
12620
  default:
11818
12621
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11826,12 +12629,13 @@ static void llama_grammar_advance_stack(
11826
12629
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
11827
12630
  // produces the N possible stacks if the given char is accepted at those
11828
12631
  // positions
11829
- std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
12632
+ void llama_grammar_accept(
11830
12633
  const std::vector<std::vector<llama_grammar_element>> & rules,
11831
12634
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
11832
- const uint32_t chr) {
12635
+ const uint32_t chr,
12636
+ std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
11833
12637
 
11834
- std::vector<std::vector<const llama_grammar_element *>> new_stacks;
12638
+ new_stacks.clear();
11835
12639
 
11836
12640
  for (const auto & stack : stacks) {
11837
12641
  if (stack.empty()) {
@@ -11850,8 +12654,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11850
12654
  llama_grammar_advance_stack(rules, new_stack, new_stacks);
11851
12655
  }
11852
12656
  }
11853
-
11854
- return new_stacks;
11855
12657
  }
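llama_grammar_accept above now fills a caller-provided new_stacks instead of returning a fresh vector, so the caller can reuse one buffer across accepted code points. A toy stand-in (plain C++, not the real grammar types) showing the allocation-reuse pattern:

    #include <cstdio>
    #include <vector>

    // stand-in for llama_grammar_accept: results go into a reusable out-parameter
    static void accept(const std::vector<int> & stacks, int chr, std::vector<int> & new_stacks) {
        new_stacks.clear();                                  // keeps the capacity from earlier calls
        for (int s : stacks) {
            new_stacks.push_back(s + chr);                   // stand-in for advancing a grammar stack
        }
    }

    int main() {
        std::vector<int> stacks = { 1, 2, 3 };
        std::vector<int> tmp;                                // allocated once, reused per code point
        for (int chr : { 10, 20, 30 }) {
            accept(stacks, chr, tmp);
            stacks = tmp;
        }
        std::printf("%zu stacks\n", stacks.size());
        return 0;
    }
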
11856
12658
 
11857
12659
  static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@@ -11865,6 +12667,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11865
12667
  const std::vector<llama_grammar_candidate> & candidates) {
11866
12668
 
11867
12669
  std::vector<llama_grammar_candidate> rejects;
12670
+ rejects.reserve(candidates.size());
11868
12671
 
11869
12672
  if (stack.empty()) {
11870
12673
  for (const auto & tok : candidates) {
@@ -11878,6 +12681,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
11878
12681
  const llama_grammar_element * stack_pos = stack.back();
11879
12682
 
11880
12683
  std::vector<llama_grammar_candidate> next_candidates;
12684
+ next_candidates.reserve(candidates.size());
12685
+
11881
12686
  for (const auto & tok : candidates) {
11882
12687
  if (*tok.code_points == 0) {
11883
12688
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12685,8 +13490,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
12685
13490
  // Note terminating 0 in decoded string
12686
13491
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
12687
13492
  const auto & code_points = decoded.first;
13493
+ std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
12688
13494
  for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
12689
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
13495
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
13496
+ grammar->stacks = tmp_new_stacks;
12690
13497
  }
12691
13498
  grammar->partial_utf8 = decoded.second;
12692
13499
  GGML_ASSERT(!grammar->stacks.empty());
@@ -12820,6 +13627,11 @@ struct llama_beam_search_data {
12820
13627
  }
12821
13628
  llama_logit_info logit_info(ctx);
12822
13629
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
13630
+
13631
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
13632
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
13633
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
13634
+
12823
13635
  size_t i=0;
12824
13636
  if (next_beams.size() < n_beams) {
12825
13637
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13318,9 +14130,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
13318
14130
  return new_type;
13319
14131
  }
13320
14132
 
13321
- static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
14133
+ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
13322
14134
  std::mutex mutex;
13323
- int counter = 0;
14135
+ int64_t counter = 0;
13324
14136
  size_t new_size = 0;
13325
14137
  if (nthread < 2) {
13326
14138
  // single-thread
@@ -13328,11 +14140,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13328
14140
  }
13329
14141
  auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
13330
14142
  nrows, n_per_row, imatrix]() {
13331
- const int nrows_per_chunk = chunk_size / n_per_row;
14143
+ const int64_t nrows_per_chunk = chunk_size / n_per_row;
13332
14144
  size_t local_size = 0;
13333
14145
  while (true) {
13334
14146
  std::unique_lock<std::mutex> lock(mutex);
13335
- int first_row = counter; counter += nrows_per_chunk;
14147
+ int64_t first_row = counter; counter += nrows_per_chunk;
13336
14148
  if (first_row >= nrows) {
13337
14149
  if (local_size > 0) {
13338
14150
  new_size += local_size;
@@ -13340,7 +14152,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
13340
14152
  break;
13341
14153
  }
13342
14154
  lock.unlock();
13343
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
14155
+ const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
13344
14156
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
13345
14157
  }
13346
14158
  };
@@ -13440,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13440
14252
  gguf_set_kv (ctx_out, ml.meta);
13441
14253
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
13442
14254
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
14255
+ // Remove split metadata
14256
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
14257
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
14258
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
13443
14259
 
13444
14260
  if (params->kv_overrides) {
13445
14261
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -13463,7 +14279,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13463
14279
  const std::string name = ggml_get_name(meta);
13464
14280
 
13465
14281
  // TODO: avoid hardcoded tensor names - use the TN_* constants
13466
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
14282
+ if (name.find("attn_v.weight") != std::string::npos ||
14283
+ name.find("attn_qkv.weight") != std::string::npos) {
13467
14284
  ++qs.n_attention_wv;
13468
14285
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13469
14286
  qs.has_output = true;
@@ -13473,7 +14290,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13473
14290
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13474
14291
 
13475
14292
  // sanity checks
13476
- GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
14293
+ //
14294
+ // - qs.n_attention_wv == 0 for Mamba models
14295
+ // - qs.n_attention_wv == model.hparams.n_layer for Transformer models
14296
+ //
14297
+ GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");
13477
14298
 
13478
14299
  size_t total_size_org = 0;
13479
14300
  size_t total_size_new = 0;
@@ -13529,6 +14350,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13529
14350
 
13530
14351
  // quantize only 2D and 3D tensors (experts)
13531
14352
  quantize &= (ggml_n_dims(tensor) >= 2);
14353
+
14354
+ // do not quantize norm tensors
14355
+ quantize &= name.find("_norm.weight") == std::string::npos;
14356
+
13532
14357
  quantize &= params->quantize_output_tensor || name != "output.weight";
13533
14358
  quantize &= !params->only_copy;
13534
14359
 
@@ -13557,10 +14382,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13557
14382
  if (!params->pure && ggml_is_quantized(default_type)) {
13558
14383
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
13559
14384
  }
13560
- else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
14385
+ if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
13561
14386
  new_type = params->token_embedding_type;
13562
14387
  }
13563
- else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
14388
+ if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
13564
14389
  new_type = params->output_tensor_type;
13565
14390
  }
13566
14391
 
@@ -13575,7 +14400,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13575
14400
  new_size = ggml_nbytes(tensor);
13576
14401
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
13577
14402
  } else {
13578
- const size_t nelements = ggml_nelements(tensor);
14403
+ const int64_t nelements = ggml_nelements(tensor);
13579
14404
 
13580
14405
  const float * imatrix = nullptr;
13581
14406
  if (imatrix_data) {
@@ -13627,20 +14452,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13627
14452
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
13628
14453
  fflush(stdout);
13629
14454
 
13630
- if (work.size() < nelements * 4) {
14455
+ if (work.size() < (size_t)nelements * 4) {
13631
14456
  work.resize(nelements * 4); // upper bound on size
13632
14457
  }
13633
14458
  new_data = work.data();
13634
14459
 
13635
- const int n_per_row = tensor->ne[0];
13636
- const int nrows = tensor->ne[1];
14460
+ const int64_t n_per_row = tensor->ne[0];
14461
+ const int64_t nrows = tensor->ne[1];
13637
14462
 
13638
- static const int min_chunk_size = 32 * 512;
13639
- const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
14463
+ static const int64_t min_chunk_size = 32 * 512;
14464
+ const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13640
14465
 
13641
- const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13642
- const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13643
- const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
14466
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
14467
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
14468
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
13644
14469
 
13645
14470
  // quantize each expert separately since they have different importance matrices
13646
14471
  new_size = 0;
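The widened int64_t counters above also cover the chunking math that splits each tensor across worker threads. A worked sketch in plain C++ with a hypothetical 4096 x 32000 tensor and 8 threads:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_per_row = 4096;                      // assumed tensor width
        const int64_t nrows     = 32000;                     // assumed tensor height
        const int     nthread   = 8;

        static const int64_t min_chunk_size = 32 * 512;      // 16384 values
        const int64_t chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);   // 4096 * 4 = 16384

        const int64_t nelements_matrix = n_per_row * nrows;
        const int64_t nchunk      = (nelements_matrix + chunk_size - 1) / chunk_size;  // 8000 chunks
        const int64_t nthread_use = nthread > 1 ? std::max((int64_t) 1, std::min((int64_t) nthread, nchunk)) : 1;

        std::printf("chunk_size=%lld nchunk=%lld threads=%lld\n",
                    (long long) chunk_size, (long long) nchunk, (long long) nthread_use);
        return 0;
    }
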
@@ -14525,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14525
15350
  case LLM_ARCH_MINICPM:
14526
15351
  case LLM_ARCH_XVERSE:
14527
15352
  case LLM_ARCH_COMMAND_R:
15353
+ case LLM_ARCH_OLMO:
14528
15354
  return LLAMA_ROPE_TYPE_NORM;
14529
15355
 
14530
15356
  // the pairs of head values are offset by n_rot/2
14531
15357
  case LLM_ARCH_FALCON:
14532
15358
  case LLM_ARCH_GROK:
15359
+ case LLM_ARCH_DBRX:
14533
15360
  case LLM_ARCH_PERSIMMON:
14534
15361
  case LLM_ARCH_BERT:
14535
15362
  case LLM_ARCH_NOMIC_BERT:
14536
15363
  case LLM_ARCH_STABLELM:
14537
15364
  case LLM_ARCH_QWEN:
14538
15365
  case LLM_ARCH_QWEN2:
15366
+ case LLM_ARCH_QWEN2MOE:
14539
15367
  case LLM_ARCH_PHI2:
14540
15368
  case LLM_ARCH_GEMMA:
14541
15369
  case LLM_ARCH_STARCODER2:
@@ -14905,9 +15733,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
14905
15733
  llama_kv_cache_update_internal(*ctx);
14906
15734
  }
14907
15735
 
15736
+ // deprecated
15737
+ size_t llama_get_state_size(const struct llama_context * ctx) {
15738
+ return llama_state_get_size(ctx);
15739
+ }
15740
+
15741
+ // deprecated
15742
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
15743
+ return llama_state_get_data(ctx, dst);
15744
+ }
15745
+
15746
+ // deprecated
15747
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15748
+ return llama_state_set_data(ctx, src);
15749
+ }
15750
+
15751
+ // deprecated
15752
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15753
+ return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15754
+ }
15755
+
15756
+ // deprecated
15757
+ bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15758
+ return llama_state_save_file(ctx, path_session, tokens, n_token_count);
15759
+ }
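The block above keeps the old llama_get_state_size / llama_copy_state_data / llama_set_state_data / llama_*_session_file entry points as thin deprecated wrappers around the new llama_state_* names. A hedged usage sketch of the new file-based pair, assuming a populated context and its token history:

    #include <vector>
    #include "llama.h"

    static bool roundtrip_state(llama_context * ctx, const std::vector<llama_token> & history) {
        if (!llama_state_save_file(ctx, "state.bin", history.data(), history.size())) {
            return false;                                    // write failed
        }
        std::vector<llama_token> tokens(history.size());     // capacity for the stored prompt
        size_t n_tokens = 0;
        return llama_state_load_file(ctx, "state.bin", tokens.data(), tokens.size(), &n_tokens);
    }
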
14908
15760
 
14909
15761
  // Returns the *maximum* size of the state
14910
- size_t llama_get_state_size(const struct llama_context * ctx) {
15762
+ size_t llama_state_get_size(const struct llama_context * ctx) {
14911
15763
  const auto & cparams = ctx->cparams;
14912
15764
  const auto & hparams = ctx->model.hparams;
14913
15765
 
@@ -14995,15 +15847,15 @@ struct llama_data_file_context : llama_data_context {
14995
15847
  * file context:
14996
15848
  * llama_file file("/path", "wb");
14997
15849
  * llama_data_file_context data_ctx(&file);
14998
- * llama_copy_state_data(ctx, &data_ctx);
15850
+ * llama_state_get_data(ctx, &data_ctx);
14999
15851
  *
15000
15852
  * buffer context:
15001
15853
  * std::vector<uint8_t> buf(max_size, 0);
15002
15854
  * llama_data_buffer_context data_ctx(&buf.data());
15003
- * llama_copy_state_data(ctx, &data_ctx);
15855
+ * llama_state_get_data(ctx, &data_ctx);
15004
15856
  *
15005
15857
  */
15006
- static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15858
+ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
15007
15859
  // copy rng
15008
15860
  {
15009
15861
  std::ostringstream rng_ss;
@@ -15147,15 +15999,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
15147
15999
  }
15148
16000
  }
15149
16001
 
15150
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
16002
+ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
15151
16003
  llama_data_buffer_context data_ctx(dst);
15152
- llama_copy_state_data_internal(ctx, &data_ctx);
16004
+ llama_state_get_data_internal(ctx, &data_ctx);
15153
16005
 
15154
16006
  return data_ctx.get_size_written();
15155
16007
  }
15156
16008
 
15157
16009
  // Sets the state reading from the specified source address
15158
- size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
16010
+ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15159
16011
  const uint8_t * inp = src;
15160
16012
 
15161
16013
  // set rng
@@ -15192,6 +16044,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15192
16044
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15193
16045
  ctx->output_ids[id] = i;
15194
16046
  }
16047
+
16048
+ ctx->n_outputs = n_outputs;
15195
16049
  }
15196
16050
  }
15197
16051
 
@@ -15307,14 +16161,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
15307
16161
  }
15308
16162
 
15309
16163
  const size_t nread = inp - src;
15310
- const size_t max_size = llama_get_state_size(ctx);
16164
+ const size_t max_size = llama_state_get_size(ctx);
15311
16165
 
15312
16166
  GGML_ASSERT(nread <= max_size);
15313
16167
 
15314
16168
  return nread;
15315
16169
  }
15316
16170
 
15317
- static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16171
+ static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15318
16172
  llama_file file(path_session, "rb");
15319
16173
 
15320
16174
  // sanity checks
@@ -15352,7 +16206,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15352
16206
  // restore the context state
15353
16207
  {
15354
16208
  const size_t n_state_size_cur = file.size - file.tell();
15355
- const size_t n_state_size_max = llama_get_state_size(ctx);
16209
+ const size_t n_state_size_max = llama_state_get_size(ctx);
15356
16210
 
15357
16211
  if (n_state_size_cur > n_state_size_max) {
15358
16212
  LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15362,22 +16216,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
15362
16216
  std::vector<uint8_t> state_data(n_state_size_max);
15363
16217
  file.read_raw(state_data.data(), n_state_size_cur);
15364
16218
 
15365
- llama_set_state_data(ctx, state_data.data());
16219
+ llama_state_set_data(ctx, state_data.data());
15366
16220
  }
15367
16221
 
15368
16222
  return true;
15369
16223
  }
15370
16224
 
15371
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16225
+ bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
15372
16226
  try {
15373
- return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
16227
+ return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
15374
16228
  } catch (const std::exception & err) {
15375
16229
  LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
15376
16230
  return false;
15377
16231
  }
15378
16232
  }
15379
16233
 
15380
- bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
16234
+ static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
15381
16235
  llama_file file(path_session, "wb");
15382
16236
 
15383
16237
  file.write_u32(LLAMA_SESSION_MAGIC);
@@ -15391,11 +16245,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
15391
16245
 
15392
16246
  // save the context state using stream saving
15393
16247
  llama_data_file_context data_ctx(&file);
15394
- llama_copy_state_data_internal(ctx, &data_ctx);
16248
+ llama_state_get_data_internal(ctx, &data_ctx);
15395
16249
 
15396
16250
  return true;
15397
16251
  }
15398
16252
 
16253
+ bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
16254
+ try {
16255
+ return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
16256
+ } catch (const std::exception & err) {
16257
+ LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
16258
+ return false;
16259
+ }
16260
+ }
16261
+
16262
+ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
16263
+ // save the size of size_t as a uint32_t for safety check
16264
+ const size_t size_t_size_size = sizeof(uint32_t);
16265
+
16266
+ // other values
16267
+ const size_t s_cell_count_size = sizeof(uint32_t);
16268
+ const size_t s_layer_count_size = sizeof(uint32_t);
16269
+ const size_t n_embd_v_gqa_size = sizeof(uint32_t);
16270
+
16271
+ size_t s_cell_count = 0;
16272
+ size_t s_cell_data_size = 0;
16273
+ const auto & kv_self = ctx->kv_self;
16274
+ const auto & hparams = ctx->model.hparams;
16275
+
16276
+ const uint32_t n_layer = hparams.n_layer;
16277
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16278
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16279
+
16280
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
16281
+ const auto & cell = kv_self.cells[i];
16282
+ if (cell.seq_id.count(seq_id) > 0) {
16283
+ ++s_cell_count;
16284
+ s_cell_data_size += sizeof(llama_pos);
16285
+ }
16286
+ }
16287
+
16288
+ for (int il = 0; il < (int)n_layer; ++il) {
16289
+ // types of keys and values
16290
+ s_cell_data_size += sizeof(int32_t) * 2;
16291
+ // k_size_row and v_size_el values of layer
16292
+ s_cell_data_size += sizeof(size_t) * 2;
16293
+
16294
+ // keys
16295
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16296
+ s_cell_data_size += k_size_row * s_cell_count;
16297
+
16298
+ // values (transposed)
16299
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16300
+ s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
16301
+ }
16302
+
16303
+ const size_t s_total = (
16304
+ size_t_size_size +
16305
+ s_cell_count_size +
16306
+ s_layer_count_size +
16307
+ n_embd_v_gqa_size +
16308
+ s_cell_data_size
16309
+ );
16310
+
16311
+ return s_total;
16312
+ }
16313
+
16314
+ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16315
+ const auto & kv_self = ctx->kv_self;
16316
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
16317
+
16318
+ // Save the size of size_t as a uint32_t for safety check
16319
+ const uint32_t size_t_size = sizeof(size_t);
16320
+ data_ctx.write(&size_t_size, sizeof(size_t_size));
16321
+
16322
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
16323
+ uint32_t cell_count = 0;
16324
+
16325
+ // Count the number of cells with the specified seq_id
16326
+ // Find all the ranges of cells with this seq id
16327
+ {
16328
+ uint32_t cell_range_begin = kv_self.size;
16329
+ for (uint32_t i = 0; i < kv_self.size; ++i) {
16330
+ const auto & cell = kv_self.cells[i];
16331
+ if (cell.has_seq_id(seq_id)) {
16332
+ ++cell_count;
16333
+ if (cell_range_begin == kv_self.size) {
16334
+ cell_range_begin = i;
16335
+ }
16336
+ }
16337
+ else {
16338
+ if (cell_range_begin != kv_self.size) {
16339
+ cell_ranges.push_back({ cell_range_begin, i });
16340
+ cell_range_begin = kv_self.size;
16341
+ }
16342
+ }
16343
+ }
16344
+ if (cell_range_begin != kv_self.size) {
16345
+ cell_ranges.push_back({ cell_range_begin, kv_self.size });
16346
+ }
16347
+
16348
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
16349
+ uint32_t cell_count_check = 0;
16350
+ for (const auto & range : cell_ranges) {
16351
+ cell_count_check += range.second - range.first;
16352
+ }
16353
+ GGML_ASSERT(cell_count == cell_count_check);
16354
+ }
16355
+
16356
+ // Write the cell count
16357
+ data_ctx.write(&cell_count, sizeof(cell_count));
16358
+
16359
+ const auto & hparams = ctx->model.hparams;
16360
+ const uint32_t n_layer = hparams.n_layer;
16361
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16362
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16363
+
16364
+ // Write the layer count
16365
+ data_ctx.write(&n_layer, sizeof(n_layer));
16366
+
16367
+ // Write n_embd_v_gqa
16368
+ data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
16369
+
16370
+ // Iterate the ranges and write all the pos (this is the token position in the prompt)
16371
+ for (const auto & range : cell_ranges) {
16372
+ for (uint32_t i = range.first; i < range.second; ++i) {
16373
+ const auto & cell = kv_self.cells[i];
16374
+ data_ctx.write(&cell.pos, sizeof(cell.pos));
16375
+ }
16376
+ }
16377
+
16378
+ // Iterate and write all the keys first, each row is a cell
16379
+ // Get whole range at a time
16380
+ std::vector<uint8_t> tmp_buf;
16381
+ for (int il = 0; il < (int)n_layer; ++il) {
16382
+ // Write key type
16383
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
16384
+ data_ctx.write(&k_type_i, sizeof(k_type_i));
16385
+
16386
+ // Write row size of key
16387
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16388
+ data_ctx.write(&k_size_row, sizeof(k_size_row));
16389
+
16390
+ // Read each range of cells of k_size length each into tmp_buf and write out
16391
+ for (const auto & range : cell_ranges) {
16392
+ const size_t range_size = range.second - range.first;
16393
+ tmp_buf.resize(range_size * k_size_row);
16394
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
16395
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16396
+ }
16397
+ }
16398
+
16399
+ // For the values, they are transposed, so we also need the element size and get the element ranges from each row
16400
+ const uint32_t kv_size = kv_self.size;
16401
+ for (int il = 0; il < (int)n_layer; ++il) {
16402
+ // Write value type
16403
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16404
+ data_ctx.write(&v_type_i, sizeof(v_type_i));
16405
+
16406
+ // Write element size
16407
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16408
+ data_ctx.write(&v_size_el, sizeof(v_size_el));
16409
+
16410
+ // For each row, we get the element values of each cell
16411
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16412
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
16413
+ for (const auto & range : cell_ranges) {
16414
+ const size_t range_size = range.second - range.first;
16415
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
16416
+ tmp_buf.resize(range_size * v_size_el);
16417
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
16418
+ data_ctx.write(tmp_buf.data(), tmp_buf.size());
16419
+ }
16420
+ }
16421
+ }
16422
+
16423
+ return data_ctx.get_size_written();
16424
+ }
16425
+
16426
+ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
16427
+ llama_data_buffer_context data_ctx(dst);
16428
+ return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
16429
+ }
16430
+
16431
+ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16432
+ auto & kv_self = ctx->kv_self;
16433
+ GGML_ASSERT(!kv_self.recurrent); // not implemented
16434
+
16435
+ // Wipe the slot
16436
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16437
+
16438
+ const uint8_t * inp = src;
16439
+
16440
+ // Read size of size_t
16441
+ uint32_t size_t_size;
16442
+ memcpy(&size_t_size, inp, sizeof(size_t_size));
16443
+ inp += sizeof(size_t_size);
16444
+ if (size_t_size != sizeof(size_t)) {
16445
+ LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
16446
+ return 0;
16447
+ }
16448
+
16449
+ // Read the cell count
16450
+ uint32_t cell_count;
16451
+ memcpy(&cell_count, inp, sizeof(cell_count));
16452
+ inp += sizeof(cell_count);
16453
+
16454
+ // Read the layer count
16455
+ uint32_t n_layer_ref;
16456
+ memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
16457
+ inp += sizeof(n_layer_ref);
16458
+
16459
+ // Read n_embd_v_gqa
16460
+ uint32_t n_embd_v_gqa_ref;
16461
+ memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
16462
+ inp += sizeof(n_embd_v_gqa_ref);
16463
+
16464
+ // Sanity check model compatibility
16465
+ const auto & hparams = ctx->model.hparams;
16466
+ const uint32_t n_layer = hparams.n_layer;
16467
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
16468
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
16469
+ if (n_layer != n_layer_ref) {
16470
+ LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
16471
+ return 0;
16472
+ }
16473
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
16474
+ LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
16475
+ return 0;
16476
+ }
16477
+
16478
+ // Allocate the new cells for the slot
16479
+ if (cell_count) {
16480
+ llama_batch batch = llama_batch_init(cell_count, 0, 1);
16481
+ batch.n_tokens = cell_count;
16482
+ for (uint32_t i = 0; i < cell_count; ++i) {
16483
+ llama_pos pos;
16484
+ memcpy(&pos, inp, sizeof(pos));
16485
+ inp += sizeof(pos);
16486
+
16487
+ batch.pos[i] = pos;
16488
+ batch.n_seq_id[i] = 1;
16489
+ batch.seq_id[i][0] = dest_seq_id;
16490
+ }
16491
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
16492
+ llama_batch_free(batch);
16493
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
16494
+ return 0;
16495
+ }
16496
+
16497
+ // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
16498
+ // Assume that this is one contiguous block of cells
16499
+ GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
16500
+ GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
16501
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
16502
+ GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
16503
+ GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
16504
+
16505
+ // Cleanup
16506
+ llama_batch_free(batch);
16507
+ }
16508
+
16509
+ const uint32_t kv_size = kv_self.size;
16510
+ const uint32_t kv_head = kv_self.head;
16511
+
16512
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
16513
+ for (int il = 0; il < (int)n_layer; ++il) {
16514
+ // Read type of key
16515
+ int32_t k_type_i_ref;
16516
+ memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
16517
+ inp += sizeof(k_type_i_ref);
16518
+ const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
16519
+ if (k_type_i != k_type_i_ref) {
16520
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16521
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
16522
+ return 0;
16523
+ }
16524
+
16525
+ // Read row size of key
16526
+ size_t k_size_row_ref;
16527
+ memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
16528
+ inp += sizeof(k_size_row_ref);
16529
+ const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
16530
+ if (k_size_row != k_size_row_ref) {
16531
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16532
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
16533
+ return 0;
16534
+ }
16535
+
16536
+ if (cell_count) {
16537
+ // Read and set the keys for the whole cell range
16538
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
16539
+ inp += cell_count * k_size_row;
16540
+ }
16541
+ }
16542
+
16543
+ // For each layer, read the values for each cell (transposed)
16544
+ for (int il = 0; il < (int)n_layer; ++il) {
16545
+ // Read type of value
16546
+ int32_t v_type_i_ref;
16547
+ memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
16548
+ inp += sizeof(v_type_i_ref);
16549
+ const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
16550
+ if (v_type_i != v_type_i_ref) {
16551
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16552
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
16553
+ return 0;
16554
+ }
16555
+
16556
+ // Read element size of value
16557
+ size_t v_size_el_ref;
16558
+ memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
16559
+ inp += sizeof(v_size_el_ref);
16560
+ const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
16561
+ if (v_size_el != v_size_el_ref) {
16562
+ llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
16563
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
16564
+ return 0;
16565
+ }
16566
+
16567
+ if (cell_count) {
16568
+ // For each row in the transposed matrix, read the values for the whole cell range
16569
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16570
+ const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
16571
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
16572
+ inp += cell_count * v_size_el;
16573
+ }
16574
+ }
16575
+ }
16576
+
16577
+ const size_t nread = inp - src;
16578
+ return nread;
16579
+ }
16580
+
16581
+ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
16582
+ llama_file file(filepath, "wb");
16583
+
16584
+ file.write_u32(LLAMA_STATE_SEQ_MAGIC);
16585
+ file.write_u32(LLAMA_STATE_SEQ_VERSION);
16586
+
16587
+ // save the prompt
16588
+ file.write_u32((uint32_t)n_token_count);
16589
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
16590
+
16591
+ // save the context state using stream saving
16592
+ llama_data_file_context data_ctx(&file);
16593
+ llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
16594
+
16595
+ const size_t res = file.tell();
16596
+ GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
16597
+ return res;
16598
+ }
16599
+
16600
+ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16601
+ llama_file file(filepath, "rb");
16602
+
16603
+ // version checks
16604
+ {
16605
+ const uint32_t magic = file.read_u32();
16606
+ const uint32_t version = file.read_u32();
16607
+
16608
+ if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
16609
+ LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
16610
+ return 0;
16611
+ }
16612
+ }
16613
+
16614
+ // load the prompt
16615
+ {
16616
+ const uint32_t n_token_count = file.read_u32();
16617
+
16618
+ if (n_token_count > n_token_capacity) {
16619
+ LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
16620
+ return 0;
16621
+ }
16622
+
16623
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
16624
+ *n_token_count_out = n_token_count;
16625
+ }
16626
+
16627
+ // restore the context state
16628
+ {
16629
+ const size_t state_size = file.size - file.tell();
16630
+ std::vector<uint8_t> state_data(state_size);
16631
+ file.read_raw(state_data.data(), state_size);
16632
+ const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
16633
+ if (!nread) {
16634
+ LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
16635
+ return 0;
16636
+ }
16637
+ GGML_ASSERT(nread <= state_size);
16638
+ GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
16639
+ }
16640
+
16641
+ return file.tell();
16642
+ }
16643
+
16644
+ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
16645
+ try {
16646
+ return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
16647
+ } catch (const std::exception & err) {
16648
+ LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
16649
+ return 0;
16650
+ }
16651
+ }
16652
+
16653
+ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
16654
+ try {
16655
+ return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
16656
+ } catch (const std::exception & err) {
16657
+ LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
16658
+ return 0;
16659
+ }
16660
+ }
16661
+
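The llama_state_seq_* functions above serialize only the KV cells that belong to one sequence (size header, cell count, layer count, n_embd_v_gqa, positions, then per-layer keys and transposed values). A hedged sketch that copies one sequence's cache into another sequence of the same context:

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    static bool copy_sequence_state(llama_context * ctx, llama_seq_id src, llama_seq_id dst) {
        std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, src));
        if (llama_state_seq_get_data(ctx, buf.data(), src) == 0) {
            return false;                                    // nothing was written for this sequence
        }
        return llama_state_seq_set_data(ctx, buf.data(), dst) != 0;
    }
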
15399
16662
  void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
15400
16663
  ctx->cparams.n_threads = n_threads;
15401
16664
  ctx->cparams.n_threads_batch = n_threads_batch;
@@ -15509,23 +16772,31 @@ float * llama_get_logits(struct llama_context * ctx) {
15509
16772
  }
15510
16773
 
15511
16774
  float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
16775
+ int32_t j = -1;
15512
16776
  llama_synchronize(ctx);
15513
16777
 
15514
16778
  try {
15515
16779
  if (ctx->logits == nullptr) {
15516
16780
  throw std::runtime_error("no logits");
15517
16781
  }
15518
- if ((size_t) i >= ctx->output_ids.size()) {
16782
+
16783
+ if (i < 0) {
16784
+ j = ctx->n_outputs + i;
16785
+ if (j < 0) {
16786
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16787
+ }
16788
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15519
16789
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16790
+ } else {
16791
+ j = ctx->output_ids[i];
15520
16792
  }
15521
- const int32_t j = ctx->output_ids[i];
15522
16793
 
15523
16794
  if (j < 0) {
15524
16795
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15525
16796
  }
15526
- if ((size_t) j >= ctx->output_size) {
16797
+ if (j >= ctx->n_outputs) {
15527
16798
  // This should not happen
15528
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16799
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15529
16800
  }
15530
16801
 
15531
16802
  return ctx->logits + j*ctx->model.hparams.n_vocab;
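With the change above, llama_get_logits_ith() (and llama_get_embeddings_ith() below) accepts negative indices that count back from the batch's output rows, so -1 addresses the last output. A hedged usage sketch:

    #include "llama.h"

    // returns the logits of the last output row produced by the most recent llama_decode()
    static float * last_logits(llama_context * ctx) {
        return llama_get_logits_ith(ctx, -1);                // -1 maps to output row n_outputs - 1
    }
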
@@ -15545,23 +16816,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
15545
16816
  }
15546
16817
 
15547
16818
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
16819
+ int32_t j = -1;
16820
+
15548
16821
  llama_synchronize(ctx);
15549
16822
 
15550
16823
  try {
15551
16824
  if (ctx->embd == nullptr) {
15552
16825
  throw std::runtime_error("no embeddings");
15553
16826
  }
15554
- if ((size_t) i >= ctx->output_ids.size()) {
16827
+
16828
+ if (i < 0) {
16829
+ j = ctx->n_outputs + i;
16830
+ if (j < 0) {
16831
+ throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
16832
+ }
16833
+ } else if ((size_t) i >= ctx->output_ids.size()) {
15555
16834
  throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
16835
+ } else {
16836
+ j = ctx->output_ids[i];
15556
16837
  }
15557
- const int32_t j = ctx->output_ids[i];
15558
16838
 
15559
16839
  if (j < 0) {
15560
16840
  throw std::runtime_error(format("batch.logits[%d] != true", i));
15561
16841
  }
15562
- if ((size_t) j >= ctx->output_size) {
16842
+ if (j >= ctx->n_outputs) {
15563
16843
  // This should not happen
15564
- throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
16844
+ throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
15565
16845
  }
15566
16846
 
15567
16847
  return ctx->embd + j*ctx->model.hparams.n_embd;
@@ -15608,6 +16888,14 @@ llama_token llama_token_eos(const struct llama_model * model) {
15608
16888
  return model->vocab.special_eos_id;
15609
16889
  }
15610
16890
 
16891
+ llama_token llama_token_cls(const struct llama_model * model) {
16892
+ return model->vocab.special_cls_id;
16893
+ }
16894
+
16895
+ llama_token llama_token_sep(const struct llama_model * model) {
16896
+ return model->vocab.special_sep_id;
16897
+ }
16898
+
15611
16899
  llama_token llama_token_nl(const struct llama_model * model) {
15612
16900
  return model->vocab.linefeed_id;
15613
16901
  }
@@ -15642,9 +16930,9 @@ int32_t llama_tokenize(
15642
16930
  int32_t text_len,
15643
16931
  llama_token * tokens,
15644
16932
  int32_t n_tokens_max,
15645
- bool add_bos,
15646
- bool special) {
15647
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
16933
+ bool add_special,
16934
+ bool parse_special) {
16935
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
15648
16936
 
15649
16937
  if (n_tokens_max < (int) res.size()) {
15650
16938
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -15910,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
15910
17198
  if (add_ass) {
15911
17199
  ss << "### Response:\n";
15912
17200
  }
17201
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
17202
+ // CohereForAI/c4ai-command-r-plus
17203
+ for (auto message : chat) {
17204
+ std::string role(message->role);
17205
+ if (role == "system") {
17206
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17207
+ } else if (role == "user") {
17208
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17209
+ } else if (role == "assistant") {
17210
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17211
+ }
17212
+ }
17213
+ if (add_ass) {
17214
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17215
+ }
15913
17216
  } else {
15914
17217
  // template not supported
15915
17218
  return -1;
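The Command-R branch added above renders each message as <|START_OF_TURN_TOKEN|><role token>content<|END_OF_TURN_TOKEN|> and, with add_ass, leaves a trailing <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>. A hedged sketch that applies it through the public llama_chat_apply_template(); passing the template name directly is assumed to bypass the model's own template, so the model handle is left null here:

    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string render_command_r(const char * user_msg) {
        const llama_chat_message chat[] = { { "user", user_msg } };
        std::vector<char> buf(1024);                         // assumed large enough for a short turn
        const int32_t n = llama_chat_apply_template(nullptr, "command-r", chat, 1,
                                                    /*add_ass =*/ true, buf.data(), (int32_t) buf.size());
        return n > 0 ? std::string(buf.data(), (size_t) n) : std::string();
    }
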