llama_cpp 0.14.5 → 0.14.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,7 +105,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 8
+ #define LLAMA_MAX_EXPERTS 60


  //
@@ -209,7 +209,9 @@ enum llm_arch {
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -220,6 +222,8 @@ enum llm_arch {
  LLM_ARCH_MAMBA,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
  LLM_ARCH_UNKNOWN,
  };

@@ -241,7 +245,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
  { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -252,6 +258,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MAMBA, "mamba" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -325,6 +333,10 @@ enum llm_kv {
  LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
  };

  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -397,6 +409,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
  };

  struct LLM_KV {
@@ -427,6 +443,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
  LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
@@ -438,6 +455,9 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -700,6 +720,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -735,6 +757,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -751,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -934,6 +995,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
+ {
+ LLM_ARCH_DBRX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_OLMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1528,12 +1619,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
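Note: the internal helper above now forwards a `special` flag to the public llama_token_to_piece() C API, which controls whether special/control tokens are rendered as text. A minimal caller-side sketch, assuming only the five-argument signature visible in this diff; the resize-on-negative-return convention mirrors the helper:

    // sketch: convert one token to a UTF-8 piece with the new `special` flag
    static std::string token_to_piece_example(const llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        if (n < 0) {
            buf.resize(-n); // a negative return value is the required buffer size
            n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        }
        return std::string(buf.data(), n);
    }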
@@ -1690,6 +1781,7 @@ enum e_model {
  MODEL_4B,
  MODEL_7B,
  MODEL_8B,
+ MODEL_12B,
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
@@ -1705,8 +1797,10 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_A2_7B,
  MODEL_8x7B,
  MODEL_8x22B,
+ MODEL_16x12B,
  };

  static const size_t kiB = 1024;
@@ -1890,6 +1984,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down_exps;
  struct ggml_tensor * ffn_up_exps ;

+ // ff shared expert (shexp)
+ struct ggml_tensor * ffn_gate_inp_shexp;
+ struct ggml_tensor * ffn_gate_shexp;
+ struct ggml_tensor * ffn_down_shexp;
+ struct ggml_tensor * ffn_up_shexp;
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -2036,10 +2136,10 @@ struct llama_vocab {
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

  id linefeed_id = 13;
- id special_prefix_id = 32007;
- id special_middle_id = 32009;
- id special_suffix_id = 32008;
- id special_eot_id = 32010;
+ id special_prefix_id = -1;
+ id special_suffix_id = -1;
+ id special_middle_id = -1;
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

  bool add_space_prefix = true;

@@ -2899,9 +2999,13 @@ struct llama_model_loader {

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
@@ -2940,15 +3044,15 @@ struct llama_model_loader {
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -2982,12 +3086,13 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3197,6 +3302,10 @@ struct llama_model_loader {
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3545,6 +3654,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
@@ -3560,8 +3670,10 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
  case MODEL_8x7B: return "8x7B";
  case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
  default: return "?B";
  }
  }
@@ -3686,7 +3798,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3834,6 +3946,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_12B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -3858,10 +3971,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_A2_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -3983,6 +4114,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_16x12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4042,6 +4195,35 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA) {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA) {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
+ }
+ }
+
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
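Note: with this change the FIM/infill token ids default to -1, are read from the new tokenizer.ggml.{prefix,suffix,middle,eot}_token_id metadata keys, and are only hard-coded as a fallback for older CodeLlama/CodeGemma conversions. A minimal sketch of how a client could assemble a CodeLlama-style infill prompt from these ids, assuming the llama_token_prefix/suffix/middle accessors from the public llama.h of this release:

    // sketch: <PRE> prefix <SUF> suffix <MID>, generation then continues after <MID>
    std::vector<llama_token> build_infill_prompt(const llama_model * model,
                                                 const std::vector<llama_token> & prefix_toks,
                                                 const std::vector<llama_token> & suffix_toks) {
        std::vector<llama_token> out;
        out.push_back(llama_token_prefix(model));                      // FIM prefix token
        out.insert(out.end(), prefix_toks.begin(), prefix_toks.end()); // code before the cursor
        out.push_back(llama_token_suffix(model));                      // FIM suffix token
        out.insert(out.end(), suffix_toks.begin(), suffix_toks.end()); // code after the cursor
        out.push_back(llama_token_middle(model));                      // FIM middle token
        return out;                                                    // each id should be checked for -1 first
    }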
@@ -4155,14 +4337,19 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
+
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
  int32_t & id = std::get<1>(it);
@@ -4177,7 +4364,6 @@ static void llm_load_vocab(
  } else {
  id = new_id;
  }
-
  }

  // Handle add_bos_token and add_eos_token
@@ -4191,6 +4377,28 @@ static void llm_load_vocab(
  vocab.special_add_eos = int(temp);
  }
  }
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+ // for now, we apply this workaround to find the EOT token based on its text
+ if (vocab.special_eot_id == -1) {
+ for (const auto & t : vocab.token_to_id) {
+ if (
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+ // need to fix convert script
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+ (t.first == "<|eot_id|>" ||
+ t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
+ t.first == "<end_of_turn>"
+ )
+ ) {
+ vocab.special_eot_id = t.second;
+ break;
+ }
+ }
+ }
  }

  // build special tokens cache
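Note: because of the fallback above, a model converted without an explicit EOT id still gets one when its vocabulary contains a known end-of-turn text such as "<|eot_id|>" or "<|im_end|>". A minimal sketch of a stop check that uses it, assuming the llama_token_eos/llama_token_eot accessors from the public llama.h of this release:

    // sketch: treat either EOS or the detected end-of-turn token as a stop condition
    bool is_stop_token(const llama_model * model, llama_token tok) {
        if (tok == llama_token_eos(model)) {
            return true;                // regular end-of-sequence
        }
        const llama_token eot = llama_token_eot(model);
        return eot != -1 && tok == eot; // "<|eot_id|>", "<|im_end|>", ...
    }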
@@ -4353,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
4353
4561
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
4354
4562
 
4355
4563
  // special tokens
4356
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4357
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4358
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4359
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4360
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4361
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4362
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4363
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4564
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
4565
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
4566
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
4567
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
4568
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
4569
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
4570
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
4571
+
4572
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
4573
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
4574
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
4575
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
4576
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
4364
4577
  }
4365
4578
 
4366
4579
  // Returns false if cancelled by progress_callback
@@ -4378,6 +4591,13 @@ static bool llm_load_tensors(
4378
4591
 
4379
4592
  auto & hparams = model.hparams;
4380
4593
 
4594
+ #ifdef GGML_USE_SYCL
4595
+ // disable MoE with SYCL until mul_mat_id is updated
4596
+ if (hparams.n_expert > 0) {
4597
+ n_gpu_layers = 0;
4598
+ }
4599
+ #endif
4600
+
4381
4601
  model.split_mode = split_mode;
4382
4602
  model.main_gpu = main_gpu;
4383
4603
  model.n_gpu_layers = n_gpu_layers;
@@ -4475,7 +4695,7 @@ static bool llm_load_tensors(
4475
4695
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4476
4696
 
4477
4697
  // for moe merged tensors
4478
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4698
+ ctx_size += ggml_tensor_overhead()*n_layer*3;
4479
4699
 
4480
4700
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4481
4701
  for (auto & it : buft_layer_count) {
@@ -4671,6 +4891,39 @@ static bool llm_load_tensors(
4671
4891
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4672
4892
  }
4673
4893
  } break;
4894
+ case LLM_ARCH_DBRX:
4895
+ {
4896
+ if (n_expert == 0) {
4897
+ throw std::runtime_error("DBRX model cannot have zero experts");
4898
+ }
4899
+
4900
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4901
+
4902
+ // output
4903
+ {
4904
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4905
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4906
+ }
4907
+
4908
+ for (int i = 0; i < n_layer; ++i) {
4909
+ ggml_context * ctx_layer = ctx_for_layer(i);
4910
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4911
+
4912
+ auto & layer = model.layers[i];
4913
+
4914
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4915
+
4916
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4917
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4918
+
4919
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4920
+
4921
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4922
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4923
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
4924
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4925
+ }
4926
+ } break;
4674
4927
  case LLM_ARCH_BAICHUAN:
4675
4928
  {
4676
4929
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4985,8 +5238,13 @@ static bool llm_load_tensors(
4985
5238
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
4986
5239
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
4987
5240
 
4988
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4989
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5241
+ // optional q and k layernorms, present in StableLM 2 12B
5242
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5243
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5244
+
5245
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5246
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5247
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4990
5248
 
4991
5249
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4992
5250
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5029,7 +5287,13 @@ static bool llm_load_tensors(
5029
5287
  // output
5030
5288
  {
5031
5289
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5032
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5290
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5291
+ // if output is NULL, init from the input tok embed
5292
+ if (model.output == NULL) {
5293
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5294
+ ml.n_created--; // artificial tensor
5295
+ ml.size_data += ggml_nbytes(model.output);
5296
+ }
5033
5297
  }
5034
5298
 
5035
5299
  for (int i = 0; i < n_layer; ++i) {
@@ -5057,6 +5321,54 @@ static bool llm_load_tensors(
5057
5321
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5058
5322
  }
5059
5323
  } break;
5324
+ case LLM_ARCH_QWEN2MOE:
5325
+ {
5326
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5327
+
5328
+ // output
5329
+ {
5330
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5331
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5332
+ }
5333
+
5334
+ for (int i = 0; i < n_layer; ++i) {
5335
+ ggml_context * ctx_layer = ctx_for_layer(i);
5336
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5337
+
5338
+ auto & layer = model.layers[i];
5339
+
5340
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5341
+
5342
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5343
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5344
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5345
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5346
+
5347
+ // optional bias tensors
5348
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5349
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5350
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5351
+
5352
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5353
+
5354
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5355
+
5356
+ GGML_ASSERT(hparams.n_expert > 0);
5357
+ GGML_ASSERT(hparams.n_expert_used > 0);
5358
+
5359
+ // MoE branch
5360
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
5361
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5362
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5363
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5364
+
5365
+ // Shared expert branch
5366
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5367
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5368
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5369
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
5370
+ }
5371
+ } break;
5060
5372
  case LLM_ARCH_PHI2:
5061
5373
  {
5062
5374
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5102,6 +5414,33 @@ static bool llm_load_tensors(
5102
5414
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
5103
5415
  }
5104
5416
  } break;
5417
+ case LLM_ARCH_PHI3:
5418
+ {
5419
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
5420
+
5421
+ // output
5422
+ {
5423
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
5424
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
5425
+ }
5426
+
5427
+ for (int i = 0; i < n_layer; ++i) {
5428
+ ggml_context* ctx_layer = ctx_for_layer(i);
5429
+ ggml_context* ctx_split = ctx_for_layer_split(i);
5430
+
5431
+ auto& layer = model.layers[i];
5432
+
5433
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
5434
+
5435
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
5436
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
5437
+
5438
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
5439
+
5440
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
5441
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
5442
+ }
5443
+ } break;
5105
5444
  case LLM_ARCH_PLAMO:
5106
5445
  {
5107
5446
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5450,6 +5789,37 @@ static bool llm_load_tensors(
5450
5789
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5451
5790
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5452
5791
 
5792
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5793
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5794
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5795
+ }
5796
+ } break;
5797
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
5798
+ {
5799
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5800
+
5801
+ // output
5802
+ {
5803
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5804
+ // if output is NULL, init from the input tok embed
5805
+ if (model.output == NULL) {
5806
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5807
+ ml.n_created--; // artificial tensor
5808
+ ml.size_data += ggml_nbytes(model.output);
5809
+ }
5810
+ }
5811
+
5812
+ for (int i = 0; i < n_layer; ++i) {
5813
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5814
+
5815
+ auto & layer = model.layers[i];
5816
+
5817
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5818
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5819
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5820
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5821
+
5822
+
5453
5823
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5454
5824
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5455
5825
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5890,6 +6260,100 @@ static struct ggml_tensor * llm_build_ffn(
5890
6260
  return cur;
5891
6261
  }
5892
6262
 
6263
+ static struct ggml_tensor * llm_build_moe_ffn(
6264
+ struct ggml_context * ctx,
6265
+ struct ggml_tensor * cur,
6266
+ struct ggml_tensor * gate_inp,
6267
+ struct ggml_tensor * up_exps,
6268
+ struct ggml_tensor * gate_exps,
6269
+ struct ggml_tensor * down_exps,
6270
+ int64_t n_expert,
6271
+ int64_t n_expert_used,
6272
+ llm_ffn_op_type type_op,
6273
+ bool norm_w,
6274
+ const llm_build_cb & cb,
6275
+ int il) {
6276
+ int64_t n_embd = cur->ne[0];
6277
+ int64_t n_tokens = cur->ne[1];
6278
+
6279
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
6280
+ cb(logits, "ffn_moe_logits", il);
6281
+
6282
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
6283
+ cb(probs, "ffn_moe_probs", il);
6284
+
6285
+ // select experts
6286
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
6287
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6288
+ cb(selected_experts, "ffn_moe_topk", il);
6289
+
6290
+ ggml_tensor * weights = ggml_get_rows(ctx,
6291
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
6292
+ cb(weights, "ffn_moe_weights", il);
6293
+
6294
+ if (norm_w) {
6295
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
6296
+
6297
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
6298
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6299
+
6300
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
6301
+ cb(weights, "ffn_moe_weights_norm", il);
6302
+
6303
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6304
+ }
6305
+
6306
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6307
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6308
+ cb(up, "ffn_moe_up", il);
6309
+
6310
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6311
+ cb(gate, "ffn_moe_gate", il);
6312
+
6313
+ switch (type_op) {
6314
+ case LLM_FFN_SILU:
6315
+ {
6316
+ gate = ggml_silu(ctx, gate);
6317
+ cb(gate, "ffn_moe_silu", il);
6318
+ } break;
6319
+ case LLM_FFN_GELU:
6320
+ {
6321
+ gate = ggml_gelu(ctx, gate);
6322
+ cb(gate, "ffn_moe_gelu", il);
6323
+ } break;
6324
+ default:
6325
+ GGML_ASSERT(false);
6326
+ }
6327
+
6328
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
6329
+ cb(par, "ffn_moe_gate_par", il);
6330
+
6331
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
6332
+ cb(experts, "ffn_moe_down", il);
6333
+
6334
+ experts = ggml_mul(ctx, experts, weights);
6335
+
6336
+ // aggregate experts
6337
+ ggml_tensor * moe_out = nullptr;
6338
+ for (int i = 0; i < n_expert_used; ++i) {
6339
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
6340
+ experts->nb[2], i*experts->nb[1]);
6341
+
6342
+ if (i == 0) {
6343
+ moe_out = cur_expert;
6344
+ } else {
6345
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
6346
+ }
6347
+ }
6348
+
6349
+ if (n_expert_used == 1) {
6350
+ // avoid returning a non-contiguous tensor
6351
+ moe_out = ggml_cont(ctx, moe_out);
6352
+ }
6353
+
6354
+ return moe_out;
6355
+ }
6356
+
5893
6357
  // if max_alibi_bias > 0 then apply ALiBi
5894
6358
  static struct ggml_tensor * llm_build_kqv(
5895
6359
  struct ggml_context * ctx,
@@ -5928,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
5928
6392
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
5929
6393
  cb(kq, "kq", il);
5930
6394
 
5931
- if (model.arch == LLM_ARCH_PHI2) {
6395
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
5932
6396
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
5933
6397
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
5934
6398
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6433,63 +6897,16 @@ struct llm_build_context {
6433
6897
  LLM_NORM_RMS, cb, il);
6434
6898
  cb(cur, "ffn_norm", il);
6435
6899
 
6436
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6437
- cb(logits, "ffn_moe_logits", il);
6438
-
6439
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6440
- cb(probs, "ffn_moe_probs", il);
6441
-
6442
- // select experts
6443
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6444
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6445
-
6446
- ggml_tensor * weights = ggml_get_rows(ctx0,
6447
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6448
- cb(weights, "ffn_moe_weights", il);
6449
-
6450
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6451
-
6452
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6453
- cb(weights_sum, "ffn_moe_weights_sum", il);
6454
-
6455
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6456
- cb(weights, "ffn_moe_weights_norm", il);
6457
-
6458
- // compute expert outputs
6459
- ggml_tensor * moe_out = nullptr;
6460
-
6461
- for (int i = 0; i < n_expert_used; ++i) {
6462
- ggml_tensor * cur_expert;
6463
-
6464
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6465
- cb(cur_up, "ffn_moe_up", il);
6466
-
6467
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6468
- cb(cur_gate, "ffn_moe_gate", il);
6469
-
6470
- cur_gate = ggml_silu(ctx0, cur_gate);
6471
- cb(cur_gate, "ffn_moe_silu", il);
6472
-
6473
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6474
- cb(cur_expert, "ffn_moe_gate_par", il);
6475
-
6476
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6477
- cb(cur_expert, "ffn_moe_down", il);
6478
-
6479
- cur_expert = ggml_mul(ctx0, cur_expert,
6480
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6481
- cb(cur_expert, "ffn_moe_weighted", il);
6482
-
6483
- if (i == 0) {
6484
- moe_out = cur_expert;
6485
- } else {
6486
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6487
- cb(moe_out, "ffn_moe_out", il);
6488
- }
6489
- }
6490
-
6491
- cur = moe_out;
6492
- }
6900
+ cur = llm_build_moe_ffn(ctx0, cur,
6901
+ model.layers[il].ffn_gate_inp,
6902
+ model.layers[il].ffn_up_exps,
6903
+ model.layers[il].ffn_gate_exps,
6904
+ model.layers[il].ffn_down_exps,
6905
+ n_expert, n_expert_used,
6906
+ LLM_FFN_SILU, true,
6907
+ cb, il);
6908
+ cb(cur, "ffn_moe_out", il);
6909
+ }
6493
6910
 
6494
6911
  cur = ggml_add(ctx0, cur, ffn_inp);
6495
6912
  cb(cur, "ffn_out", il);
@@ -6967,74 +7384,158 @@ struct llm_build_context {
6967
7384
  LLM_NORM_RMS, cb, il);
6968
7385
  cb(cur, "ffn_norm", il);
6969
7386
 
6970
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6971
- cb(logits, "ffn_moe_logits", il);
7387
+ cur = llm_build_moe_ffn(ctx0, cur,
7388
+ model.layers[il].ffn_gate_inp,
7389
+ model.layers[il].ffn_up_exps,
7390
+ model.layers[il].ffn_gate_exps,
7391
+ model.layers[il].ffn_down_exps,
7392
+ n_expert, n_expert_used,
7393
+ LLM_FFN_GELU, true,
7394
+ cb, il);
7395
+ cb(cur, "ffn_moe_out", il);
7396
+
7397
+ // Grok
7398
+ // if layer_out_norm is present then apply it before adding the input
7399
+ // Idea: maybe ffn_out_norm is a better name
7400
+ if (model.layers[il].layer_out_norm) {
7401
+ cur = llm_build_norm(ctx0, cur, hparams,
7402
+ model.layers[il].layer_out_norm, NULL,
7403
+ LLM_NORM_RMS, cb, il);
7404
+ cb(cur, "layer_out_norm", il);
7405
+ }
7406
+
7407
+ cur = ggml_add(ctx0, cur, ffn_inp);
7408
+ cb(cur, "ffn_out", il);
7409
+
7410
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7411
+ if (layer_dir != nullptr) {
7412
+ cur = ggml_add(ctx0, cur, layer_dir);
7413
+ }
7414
+ cb(cur, "l_out", il);
7415
+
7416
+ // input for next layer
7417
+ inpL = cur;
7418
+ }
7419
+
7420
+ cur = inpL;
7421
+
7422
+ cur = llm_build_norm(ctx0, cur, hparams,
7423
+ model.output_norm, NULL,
7424
+ LLM_NORM_RMS, cb, -1);
7425
+ cb(cur, "result_norm", -1);
7426
+
7427
+ // lm_head
7428
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7429
+
7430
+ // Grok
7431
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
6972
7432
 
6973
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6974
- cb(probs, "ffn_moe_probs", il);
7433
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
6975
7434
 
6976
- // select experts
6977
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6978
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
7435
+ cb(cur, "result_output", -1);
6979
7436
 
6980
- ggml_tensor * weights = ggml_get_rows(ctx0,
6981
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6982
- cb(weights, "ffn_moe_weights", il);
7437
+ ggml_build_forward_expand(gf, cur);
6983
7438
 
6984
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
7439
+ return gf;
7440
+ }
7441
+
7442
+ struct ggml_cgraph * build_dbrx() {
7443
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6985
7444
 
6986
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6987
- cb(weights_sum, "ffn_moe_weights_sum", il);
7445
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
7446
+ int32_t n_tokens = this->n_tokens;
6988
7447
 
6989
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6990
- cb(weights, "ffn_moe_weights_norm", il);
7448
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7449
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7450
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7451
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6991
7452
 
6992
- // compute expert outputs
6993
- ggml_tensor * moe_out = nullptr;
7453
+ struct ggml_tensor * cur;
7454
+ struct ggml_tensor * inpL;
6994
7455
 
6995
- for (int i = 0; i < n_expert_used; ++i) {
6996
- ggml_tensor * cur_expert;
7456
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6997
7457
 
6998
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6999
- cb(cur_up, "ffn_moe_up", il);
7458
+ // inp_pos - contains the positions
7459
+ struct ggml_tensor * inp_pos = build_inp_pos();
7000
7460
 
7001
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
7002
- cb(cur_gate, "ffn_moe_gate", il);
7461
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7462
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7003
7463
 
7004
- //GeLU
7005
- cur_gate = ggml_gelu(ctx0, cur_gate);
7006
- cb(cur_gate, "ffn_moe_gelu", il);
7464
+ for (int il = 0; il < n_layer; ++il) {
7465
+ struct ggml_tensor * inpSA = inpL;
7007
7466
 
7008
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
7009
- cb(cur_expert, "ffn_moe_gate_par", il);
7467
+ // norm
7468
+ cur = llm_build_norm(ctx0, inpL, hparams,
7469
+ model.layers[il].attn_norm, NULL,
7470
+ LLM_NORM, cb, il);
7471
+ cb(cur, "attn_norm", il);
7010
7472
 
7011
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
7012
- cb(cur_expert, "ffn_moe_down", il);
7473
+ // self-attention
7474
+ {
7475
+ struct ggml_tensor * Qcur = nullptr;
7476
+ struct ggml_tensor * Kcur = nullptr;
7477
+ struct ggml_tensor * Vcur = nullptr;
7013
7478
 
7014
- cur_expert = ggml_mul(ctx0, cur_expert,
7015
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
7016
- cb(cur_expert, "ffn_moe_weighted", il);
7479
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7480
+ cb(cur, "wqkv", il);
7017
7481
 
7018
- if (i == 0) {
7019
- moe_out = cur_expert;
7020
- } else {
7021
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
7022
- cb(moe_out, "ffn_moe_out", il);
7023
- }
7024
- }
7482
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
7483
+ cb(cur, "wqkv_clamped", il);
7025
7484
 
7026
- cur = moe_out;
7485
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7486
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7487
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7027
7488
 
7028
- // Grok
7029
- // if layer_out_norm is present then apply it before adding the input
7030
- // Idea: maybe ffn_out_norm is a better name
7031
- if (model.layers[il].layer_out_norm) {
7032
- cur = llm_build_norm(ctx0, cur, hparams,
7033
- model.layers[il].layer_out_norm, NULL,
7034
- LLM_NORM_RMS, cb, il);
7035
- cb(cur, "layer_out_norm", il);
7489
+ cb(Qcur, "Qcur", il);
7490
+ cb(Kcur, "Kcur", il);
7491
+ cb(Vcur, "Vcur", il);
7492
+
7493
+ Qcur = ggml_rope_custom(
7494
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7495
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7496
+ ext_factor, attn_factor, beta_fast, beta_slow
7497
+ );
7498
+ cb(Qcur, "Qcur", il);
7499
+
7500
+ Kcur = ggml_rope_custom(
7501
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7502
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7503
+ ext_factor, attn_factor, beta_fast, beta_slow
7504
+ );
7505
+ cb(Kcur, "Kcur", il);
7506
+
7507
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7508
+ model.layers[il].wo, NULL,
7509
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7036
7510
  }
7037
7511
 
7512
+ if (il == n_layer - 1) {
7513
+ // skip computing output for unused tokens
7514
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7515
+ n_tokens = n_outputs;
7516
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7517
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7518
+ }
7519
+
7520
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7521
+ cb(ffn_inp, "ffn_inp", il);
7522
+
7523
+ // feed-forward network
7524
+ // MoE branch
7525
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7526
+ model.layers[il].attn_out_norm, NULL,
7527
+ LLM_NORM, cb, il);
7528
+ cb(cur, "attn_out_norm", il);
7529
+
7530
+ cur = llm_build_moe_ffn(ctx0, cur,
7531
+ model.layers[il].ffn_gate_inp,
7532
+ model.layers[il].ffn_up_exps,
7533
+ model.layers[il].ffn_gate_exps,
7534
+ model.layers[il].ffn_down_exps,
7535
+ n_expert, n_expert_used,
7536
+ LLM_FFN_SILU, true,
7537
+ cb, il);
7538
+ cb(cur, "ffn_moe_out", il);
7038
7539
 
7039
7540
  cur = ggml_add(ctx0, cur, ffn_inp);
7040
7541
  cb(cur, "ffn_out", il);
@@ -7052,18 +7553,13 @@ struct llm_build_context {
7052
7553
  cur = inpL;
7053
7554
 
7054
7555
  cur = llm_build_norm(ctx0, cur, hparams,
7055
- model.output_norm, NULL,
7056
- LLM_NORM_RMS, cb, -1);
7556
+ model.output_norm, NULL,
7557
+ LLM_NORM, cb, -1);
7057
7558
  cb(cur, "result_norm", -1);
7058
7559
 
7059
7560
  // lm_head
7060
7561
  cur = ggml_mul_mat(ctx0, model.output, cur);
7061
7562
 
7062
- // Grok
7063
- // multiply logits by output_multiplier_scale of 0.5773502691896257
7064
-
7065
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7066
-
7067
7563
  cb(cur, "result_output", -1);
7068
7564
 
7069
7565
  ggml_build_forward_expand(gf, cur);
@@ -7923,7 +8419,7 @@ struct llm_build_context {
7923
8419
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7924
8420
 
7925
8421
  for (int il = 0; il < n_layer; ++il) {
7926
- struct ggml_tensor * inpSA = inpL;
8422
+
7927
8423
 
7928
8424
  // norm
7929
8425
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7932,6 +8428,8 @@ struct llm_build_context {
7932
8428
  LLM_NORM, cb, il);
7933
8429
  cb(cur, "attn_norm", il);
7934
8430
 
8431
+ struct ggml_tensor * inpSA = cur;
8432
+
7935
8433
  // self-attention
7936
8434
  {
7937
8435
  // compute Q and K and RoPE them
@@ -7956,15 +8454,36 @@ struct llm_build_context {
7956
8454
  cb(Vcur, "Vcur", il);
7957
8455
  }
7958
8456
 
8457
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8458
+ cb(Qcur, "Qcur", il);
8459
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8460
+ cb(Kcur, "Kcur", il);
8461
+
8462
+ if (model.layers[il].attn_q_norm) {
8463
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8464
+ model.layers[il].attn_q_norm,
8465
+ NULL,
8466
+ LLM_NORM, cb, il);
8467
+ cb(Qcur, "Qcur", il);
8468
+ }
8469
+ if (model.layers[il].attn_k_norm) {
8470
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8471
+ model.layers[il].attn_k_norm,
8472
+ NULL,
8473
+ LLM_NORM, cb, il);
8474
+ cb(Kcur, "Kcur", il);
8475
+ }
8476
+
8477
+
7959
8478
  Qcur = ggml_rope_custom(
7960
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8479
+ ctx0, Qcur, inp_pos,
7961
8480
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7962
8481
  ext_factor, attn_factor, beta_fast, beta_slow
7963
8482
  );
7964
8483
  cb(Qcur, "Qcur", il);
7965
8484
 
7966
8485
  Kcur = ggml_rope_custom(
7967
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8486
+ ctx0, Kcur, inp_pos,
7968
8487
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7969
8488
  ext_factor, attn_factor, beta_fast, beta_slow
7970
8489
  );
@@ -7979,20 +8498,25 @@ struct llm_build_context {
7979
8498
  // skip computing output for unused tokens
7980
8499
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7981
8500
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8501
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7982
8502
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7983
8503
  }
7984
8504
 
7985
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8505
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7986
8506
  cb(ffn_inp, "ffn_inp", il);
7987
8507
 
7988
8508
  // feed-forward network
7989
8509
  {
7990
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
7991
- model.layers[il].ffn_norm,
7992
- model.layers[il].ffn_norm_b,
7993
- LLM_NORM, cb, il);
7994
- cb(cur, "ffn_norm", il);
7995
-
8510
+ if (model.layers[il].ffn_norm) {
8511
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8512
+ model.layers[il].ffn_norm,
8513
+ model.layers[il].ffn_norm_b,
8514
+ LLM_NORM, cb, il);
8515
+ cb(cur, "ffn_norm", il);
8516
+ } else {
8517
+ // parallel residual
8518
+ cur = inpSA;
8519
+ }
7996
8520
  cur = llm_build_ffn(ctx0, cur,
7997
8521
  model.layers[il].ffn_up, NULL,
7998
8522
  model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8706,6 @@ struct llm_build_context {
8182
8706
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8183
8707
  cb(Vcur, "Vcur", il);
8184
8708
 
8185
- // these nodes are added to the graph together so that they are not reordered
8186
- // by doing so, the number of splits in the graph is reduced
8187
- ggml_build_forward_expand(gf, Qcur);
8188
- ggml_build_forward_expand(gf, Kcur);
8189
- ggml_build_forward_expand(gf, Vcur);
8190
-
8191
8709
  Qcur = ggml_rope_custom(
8192
8710
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8193
8711
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8245,25 +8763,288 @@ struct llm_build_context {
8245
8763
  LLM_NORM_RMS, cb, -1);
8246
8764
  cb(cur, "result_norm", -1);
8247
8765
 
8248
- // lm_head
8766
+ // lm_head
8767
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8768
+ cb(cur, "result_output", -1);
8769
+
8770
+ ggml_build_forward_expand(gf, cur);
8771
+
8772
+ return gf;
8773
+ }
8774
+
8775
+ struct ggml_cgraph * build_qwen2moe() {
8776
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8777
+
8778
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
8779
+ int32_t n_tokens = this->n_tokens;
8780
+
8781
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8782
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8783
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8784
+
8785
+ struct ggml_tensor * cur;
8786
+ struct ggml_tensor * inpL;
8787
+
8788
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8789
+
8790
+ // inp_pos - contains the positions
8791
+ struct ggml_tensor * inp_pos = build_inp_pos();
8792
+
8793
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8794
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8795
+
8796
+ for (int il = 0; il < n_layer; ++il) {
8797
+ struct ggml_tensor * inpSA = inpL;
8798
+
8799
+ // norm
8800
+ cur = llm_build_norm(ctx0, inpL, hparams,
8801
+ model.layers[il].attn_norm, NULL,
8802
+ LLM_NORM_RMS, cb, il);
8803
+ cb(cur, "attn_norm", il);
8804
+
8805
+ // self_attention
8806
+ {
8807
+ // compute Q and K and RoPE them
8808
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8809
+ cb(Qcur, "Qcur", il);
8810
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8811
+ cb(Qcur, "Qcur", il);
8812
+
8813
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8814
+ cb(Kcur, "Kcur", il);
8815
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8816
+ cb(Kcur, "Kcur", il);
8817
+
8818
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8819
+ cb(Vcur, "Vcur", il);
8820
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8821
+ cb(Vcur, "Vcur", il);
8822
+
8823
+ Qcur = ggml_rope_custom(
8824
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8825
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8826
+ ext_factor, attn_factor, beta_fast, beta_slow
8827
+ );
8828
+ cb(Qcur, "Qcur", il);
8829
+
8830
+ Kcur = ggml_rope_custom(
8831
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8832
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8833
+ ext_factor, attn_factor, beta_fast, beta_slow
8834
+ );
8835
+ cb(Kcur, "Kcur", il);
8836
+
8837
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8838
+ model.layers[il].wo, model.layers[il].bo,
8839
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8840
+ }
8841
+
8842
+ if (il == n_layer - 1) {
8843
+ // skip computing output for unused tokens
8844
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8845
+ n_tokens = n_outputs;
8846
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8847
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8848
+ }
8849
+
8850
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8851
+ cb(ffn_inp, "ffn_inp", il);
8852
+
8853
+ // MoE branch
8854
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8855
+ model.layers[il].ffn_norm, NULL,
8856
+ LLM_NORM_RMS, cb, il);
8857
+ cb(cur, "ffn_norm", il);
8858
+
8859
+ ggml_tensor * moe_out =
8860
+ llm_build_moe_ffn(ctx0, cur,
8861
+ model.layers[il].ffn_gate_inp,
8862
+ model.layers[il].ffn_up_exps,
8863
+ model.layers[il].ffn_gate_exps,
8864
+ model.layers[il].ffn_down_exps,
8865
+ n_expert, n_expert_used,
8866
+ LLM_FFN_SILU, false,
8867
+ cb, il);
8868
+ cb(cur, "ffn_moe_out", il);
8869
+
8870
+ // FFN shared expert
8871
+ {
8872
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
8873
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
8874
+
8875
+ // sigmoid
8876
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
8877
+ cb(cur_gate, "ffn_shexp_gate", il);
8878
+
8879
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
8880
+ model.layers[il].ffn_up_shexp, NULL,
8881
+ model.layers[il].ffn_gate_shexp, NULL,
8882
+ model.layers[il].ffn_down_shexp, NULL,
8883
+ NULL,
8884
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
8885
+ cb(cur_ffn, "ffn_shexp", il);
8886
+
8887
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
8888
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
8889
+
8890
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
8891
+ cb(moe_out, "ffn_out", il);
8892
+
8893
+ cur = moe_out;
8894
+ }
8895
+
8896
+ cur = ggml_add(ctx0, cur, ffn_inp);
8897
+ cb(cur, "l_out", il);
8898
+
8899
+ // input for next layer
8900
+ inpL = cur;
8901
+ }
8902
+
8903
+ cur = inpL;
8904
+
8905
+ cur = llm_build_norm(ctx0, cur, hparams,
8906
+ model.output_norm, NULL,
8907
+ LLM_NORM_RMS, cb, -1);
8908
+ cb(cur, "result_norm", -1);
8909
+
8910
+ // lm_head
8911
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8912
+ cb(cur, "result_output", -1);
8913
+
8914
+ ggml_build_forward_expand(gf, cur);
8915
+
8916
+ return gf;
8917
+ }
8918
+
8919
+ struct ggml_cgraph * build_phi2() {
8920
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8921
+
8922
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8923
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8924
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8925
+
8926
+ struct ggml_tensor * cur;
8927
+ struct ggml_tensor * attn_norm_output;
8928
+ struct ggml_tensor * ffn_output;
8929
+ struct ggml_tensor * inpL;
8930
+
8931
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8932
+
8933
+ // inp_pos - contains the positions
8934
+ struct ggml_tensor * inp_pos = build_inp_pos();
8935
+
8936
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8937
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8938
+
8939
+ for (int il = 0; il < n_layer; ++il) {
8940
+ attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
8941
+ model.layers[il].attn_norm,
8942
+ model.layers[il].attn_norm_b,
8943
+ LLM_NORM, cb, il);
8944
+ cb(attn_norm_output, "attn_norm", il);
8945
+
8946
+ // self-attention
8947
+ {
8948
+ struct ggml_tensor * Qcur = nullptr;
8949
+ struct ggml_tensor * Kcur = nullptr;
8950
+ struct ggml_tensor * Vcur = nullptr;
8951
+
8952
+ if (model.layers[il].wqkv) {
8953
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
8954
+ cb(cur, "wqkv", il);
8955
+
8956
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8957
+ cb(cur, "bqkv", il);
8958
+
8959
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8960
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8961
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8962
+ } else {
8963
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8964
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8965
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8966
+ }
8967
+
8968
+ cb(Qcur, "Qcur", il);
8969
+ cb(Kcur, "Kcur", il);
8970
+ cb(Vcur, "Vcur", il);
8971
+
8972
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8973
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8974
+
8975
+ Qcur = ggml_rope_custom(
8976
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8977
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8978
+ );
8979
+ cb(Qcur, "Qcur", il);
8980
+
8981
+ // with phi2, we scale the Q to avoid precision issues
8982
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
8983
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
8984
+ cb(Qcur, "Qcur", il);
8985
+
8986
+ Kcur = ggml_rope_custom(
8987
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8988
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8989
+ );
8990
+ cb(Kcur, "Kcur", il);
8991
+
8992
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8993
+ model.layers[il].wo, model.layers[il].bo,
8994
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
+ }
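The Q pre-scaling above is ordinary scaled-dot-product attention with the 1/sqrt(d) factor folded into Q, which is why llm_build_kv is then called with a scale of 1.0f: the result is identical, but the Q*K^T intermediates stay smaller. As an equation (a restatement, not new code):

    \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d}}\right)
    \;=\;
    \mathrm{softmax}\!\left(\left(\tfrac{1}{\sqrt{d}}\,Q\right) K^{\top}\right),
    \qquad d = \texttt{n\_embd\_head}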
8996
+
8997
+ if (il == n_layer - 1) {
8998
+ // skip computing output for unused tokens
8999
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9000
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9001
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9002
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
9003
+ }
9004
+
9005
+ // FF
9006
+ {
9007
+ ffn_output = llm_build_ffn(ctx0, attn_norm_output,
9008
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
9009
+ NULL, NULL,
9010
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
9011
+ NULL,
9012
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
9013
+ cb(ffn_output, "ffn_out", il);
9014
+ }
9015
+
9016
+ cur = ggml_add(ctx0, cur, ffn_output);
9017
+ cb(cur, "l_out", il);
9018
+
9019
+ cur = ggml_add(ctx0, cur, inpL);
9020
+ cb(cur, "l_out", il);
9021
+
9022
+ inpL = cur;
9023
+ }
9024
+
9025
+ cur = llm_build_norm(ctx0, inpL, hparams,
9026
+ model.output_norm,
9027
+ model.output_norm_b,
9028
+ LLM_NORM, cb, -1);
9029
+ cb(cur, "result_norm", -1);
9030
+
8249
9031
  cur = ggml_mul_mat(ctx0, model.output, cur);
8250
- cb(cur, "result_output", -1);
9032
+ cb(cur, "result_output_no_bias", -1);
8251
9033
 
9034
+ cur = ggml_add(ctx0, cur, model.output_b);
9035
+ cb(cur, "result_output", -1);
8252
9036
  ggml_build_forward_expand(gf, cur);
8253
-
8254
9037
  return gf;
8255
9038
  }
8256
9039
 
8257
- struct ggml_cgraph * build_phi2() {
9040
+ struct ggml_cgraph * build_phi3() {
8258
9041
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8259
9042
 
8260
9043
  const int64_t n_embd_head = hparams.n_embd_head_v;
8261
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
9044
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8262
9045
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8263
9046
 
8264
9047
  struct ggml_tensor * cur;
8265
- struct ggml_tensor * attn_norm_output;
8266
- struct ggml_tensor * ffn_output;
8267
9048
  struct ggml_tensor * inpL;
8268
9049
 
8269
9050
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8275,14 +9056,16 @@ struct llm_build_context {
8275
9056
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8276
9057
 
8277
9058
  for (int il = 0; il < n_layer; ++il) {
8278
- attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
8279
- model.layers[il].attn_norm,
8280
- model.layers[il].attn_norm_b,
8281
- LLM_NORM, cb, il);
8282
- cb(attn_norm_output, "attn_norm", il);
9059
+ auto residual = inpL;
8283
9060
 
8284
9061
  // self-attention
8285
9062
  {
9063
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9064
+ model.layers[il].attn_norm,
9065
+ NULL,
9066
+ LLM_NORM_RMS, cb, il);
9067
+ cb(attn_norm_output, "attn_norm", il);
9068
+
8286
9069
  struct ggml_tensor * Qcur = nullptr;
8287
9070
  struct ggml_tensor * Kcur = nullptr;
8288
9071
  struct ggml_tensor * Vcur = nullptr;
@@ -8291,13 +9074,11 @@ struct llm_build_context {
8291
9074
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
8292
9075
  cb(cur, "wqkv", il);
8293
9076
 
8294
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8295
- cb(cur, "bqkv", il);
8296
-
8297
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8298
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8299
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8300
- } else {
9077
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
9078
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
9079
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9080
+ }
9081
+ else {
8301
9082
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8302
9083
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8303
9084
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -8316,9 +9097,7 @@ struct llm_build_context {
8316
9097
  );
8317
9098
  cb(Qcur, "Qcur", il);
8318
9099
 
8319
- // with phi2, we scale the Q to avoid precision issues
8320
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
8321
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9100
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
8322
9101
  cb(Qcur, "Qcur", il);
8323
9102
 
8324
9103
  Kcur = ggml_rope_custom(
@@ -8328,48 +9107,58 @@ struct llm_build_context {
8328
9107
  cb(Kcur, "Kcur", il);
8329
9108
 
8330
9109
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8331
- model.layers[il].wo, model.layers[il].bo,
8332
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9110
+ model.layers[il].wo, NULL,
9111
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8333
9112
  }
8334
9113
 
8335
9114
  if (il == n_layer - 1) {
8336
9115
  // skip computing output for unused tokens
8337
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8338
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8339
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8340
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
9116
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
9117
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9118
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8341
9119
  }
8342
9120
 
9121
+ cur = ggml_add(ctx0, cur, residual);
9122
+ residual = cur;
9123
+
9124
+ cur = llm_build_norm(ctx0, cur, hparams,
9125
+ model.layers[il].ffn_norm, NULL,
9126
+ LLM_NORM_RMS, cb, il);
9127
+ cb(cur, "ffn_norm", il);
9128
+
8343
9129
  // FF
9130
+ // special-case: the up and gate tensors are merged into a single tensor
9131
+ // TODO: support this in llm_build_ffn
8344
9132
  {
8345
- ffn_output = llm_build_ffn(ctx0, attn_norm_output,
8346
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8347
- NULL, NULL,
8348
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8349
- NULL,
8350
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8351
- cb(ffn_output, "ffn_out", il);
8352
- }
9133
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
9134
+ cb(up, "ffn_up", il);
8353
9135
 
8354
- cur = ggml_add(ctx0, cur, ffn_output);
8355
- cb(cur, "l_out", il);
9136
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
9137
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
8356
9138
 
8357
- cur = ggml_add(ctx0, cur, inpL);
9139
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
9140
+ cb(y, "ffn_gate", il);
9141
+
9142
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
9143
+ cb(down, "ffn_down", il);
9144
+
9145
+ cur = down;
9146
+ cb(cur, "ffn_out", il);
9147
+ }
9148
+
9149
+ cur = ggml_add(ctx0, residual, cur);
8358
9150
  cb(cur, "l_out", il);
8359
9151
 
8360
9152
  inpL = cur;
8361
9153
  }
8362
9154
 
8363
9155
  cur = llm_build_norm(ctx0, inpL, hparams,
8364
- model.output_norm,
8365
- model.output_norm_b,
8366
- LLM_NORM, cb, -1);
9156
+ model.output_norm,
9157
+ NULL,
9158
+ LLM_NORM_RMS, cb, -1);
8367
9159
  cb(cur, "result_norm", -1);
8368
9160
 
8369
9161
  cur = ggml_mul_mat(ctx0, model.output, cur);
8370
- cb(cur, "result_output_no_bias", -1);
8371
-
8372
- cur = ggml_add(ctx0, cur, model.output_b);
8373
9162
  cb(cur, "result_output", -1);
8374
9163
 
8375
9164
  ggml_build_forward_expand(gf, cur);
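In the Phi-3 feed-forward block above, the gate and up projections are stored as one merged ffn_up tensor, so the builder splits the projected activations in two with ggml_view_2d and applies the SwiGLU gating by hand instead of going through llm_build_ffn. A self-contained sketch of the same split-then-gate pattern, assuming a merged projection of width 2*n_ff and float activations (illustrative names, not the diff's code):

    #include "ggml.h"

    // x:        [n_embd, n_tokens] activations
    // w_gateup: [n_embd, 2*n_ff]   merged gate+up weights (gate half first, up half second)
    // w_down:   [n_ff,   n_embd]   down projection
    static struct ggml_tensor * swiglu_merged(struct ggml_context * ctx0,
                                              struct ggml_tensor * w_gateup,
                                              struct ggml_tensor * w_down,
                                              struct ggml_tensor * x) {
        struct ggml_tensor * up = ggml_mul_mat(ctx0, w_gateup, x);   // [2*n_ff, n_tokens]

        const int64_t n_ff = up->ne[0] / 2;
        const size_t  nb1  = ggml_row_size(up->type, up->ne[0]);     // bytes per full row
        const size_t  half = ggml_row_size(up->type, n_ff);          // byte offset of the second half

        struct ggml_tensor * g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, n_ff, up->ne[1], nb1, 0));
        struct ggml_tensor * u = ggml_cont(ctx0, ggml_view_2d(ctx0, up, n_ff, up->ne[1], nb1, half));

        struct ggml_tensor * y = ggml_mul(ctx0, u, ggml_silu(ctx0, g)); // u * silu(g)
        return ggml_mul_mat(ctx0, w_down, y);                           // project back to n_embd
    }

The diff expresses the second-half offset as up->nb[1] / 2, which comes to the same byte offset for float tensors.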
@@ -8377,6 +9166,7 @@ struct llm_build_context {
8377
9166
  return gf;
8378
9167
  }
8379
9168
 
9169
+
8380
9170
  struct ggml_cgraph * build_plamo() {
8381
9171
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
8382
9172
 
@@ -9588,6 +10378,139 @@ struct llm_build_context {
9588
10378
  return gf;
9589
10379
 
9590
10380
  }
10381
+
10382
+ // ref: https://allenai.org/olmo
10383
+ // based on the original build_llama() function, changes:
10384
+ // * non-parametric layer norm
10385
+ // * clamp qkv
10386
+ // * removed bias
10387
+ // * removed MoE
10388
+ struct ggml_cgraph * build_olmo() {
10389
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10390
+
10391
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10392
+ int32_t n_tokens = this->n_tokens;
10393
+
10394
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10395
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10396
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10397
+
10398
+ struct ggml_tensor * cur;
10399
+ struct ggml_tensor * inpL;
10400
+
10401
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10402
+
10403
+ // inp_pos - contains the positions
10404
+ struct ggml_tensor * inp_pos = build_inp_pos();
10405
+
10406
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10407
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10408
+
10409
+ for (int il = 0; il < n_layer; ++il) {
10410
+ struct ggml_tensor * inpSA = inpL;
10411
+
10412
+ // norm
10413
+ cur = llm_build_norm(ctx0, inpL, hparams,
10414
+ NULL, NULL,
10415
+ LLM_NORM, cb, il);
10416
+ cb(cur, "attn_norm", il);
10417
+
10418
+ // self-attention
10419
+ {
10420
+ // compute Q and K and RoPE them
10421
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10422
+ cb(Qcur, "Qcur", il);
10423
+ if (hparams.f_clamp_kqv > 0.0f) {
10424
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10425
+ cb(Qcur, "Qcur", il);
10426
+ }
10427
+
10428
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10429
+ cb(Kcur, "Kcur", il);
10430
+ if (hparams.f_clamp_kqv > 0.0f) {
10431
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10432
+ cb(Kcur, "Kcur", il);
10433
+ }
10434
+
10435
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10436
+ cb(Vcur, "Vcur", il);
10437
+ if (hparams.f_clamp_kqv > 0.0f) {
10438
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10439
+ cb(Vcur, "Vcur", il);
10440
+ }
10441
+
10442
+ Qcur = ggml_rope_custom(
10443
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10444
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10445
+ ext_factor, attn_factor, beta_fast, beta_slow
10446
+ );
10447
+ cb(Qcur, "Qcur", il);
10448
+
10449
+ Kcur = ggml_rope_custom(
10450
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10451
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10452
+ ext_factor, attn_factor, beta_fast, beta_slow
10453
+ );
10454
+ cb(Kcur, "Kcur", il);
10455
+
10456
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10457
+ model.layers[il].wo, nullptr,
10458
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
+ }
10460
+
10461
+ if (il == n_layer - 1) {
10462
+ // skip computing output for unused tokens
10463
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10464
+ n_tokens = n_outputs;
10465
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10466
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10467
+ }
10468
+
10469
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10470
+ cb(ffn_inp, "ffn_inp", il);
10471
+
10472
+ // feed-forward network
10473
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10474
+ NULL, NULL,
10475
+ LLM_NORM, cb, il);
10476
+ cb(cur, "ffn_norm", il);
10477
+
10478
+ cur = llm_build_ffn(ctx0, cur,
10479
+ model.layers[il].ffn_up, NULL,
10480
+ model.layers[il].ffn_gate, NULL,
10481
+ model.layers[il].ffn_down, NULL,
10482
+ NULL,
10483
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10484
+ cb(cur, "ffn_out", il);
10485
+
10486
+ cur = ggml_add(ctx0, cur, ffn_inp);
10487
+ cb(cur, "ffn_out", il);
10488
+
10489
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10490
+ if (layer_dir != nullptr) {
10491
+ cur = ggml_add(ctx0, cur, layer_dir);
10492
+ }
10493
+ cb(cur, "l_out", il);
10494
+
10495
+ // input for next layer
10496
+ inpL = cur;
10497
+ }
10498
+
10499
+ cur = inpL;
10500
+
10501
+ cur = llm_build_norm(ctx0, cur, hparams,
10502
+ NULL, NULL,
10503
+ LLM_NORM, cb, -1);
10504
+ cb(cur, "result_norm", -1);
10505
+
10506
+ // lm_head
10507
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10508
+ cb(cur, "result_output", -1);
10509
+
10510
+ ggml_build_forward_expand(gf, cur);
10511
+
10512
+ return gf;
10513
+ }
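build_olmo above passes NULL for both the norm weight and bias, i.e. a non-parametric LayerNorm: the activations are standardized but no learned scale or shift is applied. Restated as math (what llm_build_norm computes here with LLM_NORM and no parameters):

    y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}},
    \qquad
    \mu = \frac{1}{n}\sum_{j} x_j,
    \quad
    \sigma^2 = \frac{1}{n}\sum_{j} (x_j - \mu)^2

with no learned \gamma or \beta, unlike the parametric norms used by the other builders.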
9591
10514
  };
9592
10515
 
9593
10516
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9737,10 +10660,18 @@ static struct ggml_cgraph * llama_build_graph(
9737
10660
  {
9738
10661
  result = llm.build_qwen2();
9739
10662
  } break;
10663
+ case LLM_ARCH_QWEN2MOE:
10664
+ {
10665
+ result = llm.build_qwen2moe();
10666
+ } break;
9740
10667
  case LLM_ARCH_PHI2:
9741
10668
  {
9742
10669
  result = llm.build_phi2();
9743
10670
  } break;
10671
+ case LLM_ARCH_PHI3:
10672
+ {
10673
+ result = llm.build_phi3();
10674
+ } break;
9744
10675
  case LLM_ARCH_PLAMO:
9745
10676
  {
9746
10677
  result = llm.build_plamo();
@@ -9785,6 +10716,14 @@ static struct ggml_cgraph * llama_build_graph(
9785
10716
  {
9786
10717
  result = llm.build_command_r();
9787
10718
  } break;
10719
+ case LLM_ARCH_DBRX:
10720
+ {
10721
+ result = llm.build_dbrx();
10722
+ } break;
10723
+ case LLM_ARCH_OLMO:
10724
+ {
10725
+ result = llm.build_olmo();
10726
+ } break;
9788
10727
  default:
9789
10728
  GGML_ASSERT(false);
9790
10729
  }
@@ -12556,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
12556
13495
  GGML_ASSERT(ctx);
12557
13496
  const int64_t t_start_sample_us = ggml_time_us();
12558
13497
 
12559
- bool allow_eos = false;
13498
+ bool allow_eog = false;
12560
13499
  for (const auto & stack : grammar->stacks) {
12561
13500
  if (stack.empty()) {
12562
- allow_eos = true;
13501
+ allow_eog = true;
12563
13502
  break;
12564
13503
  }
12565
13504
  }
12566
13505
 
12567
- const llama_token eos = llama_token_eos(&ctx->model);
12568
-
12569
13506
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
12570
13507
  candidates_decoded.reserve(candidates->size);
12571
13508
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -12573,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
12573
13510
 
12574
13511
  for (size_t i = 0; i < candidates->size; ++i) {
12575
13512
  const llama_token id = candidates->data[i].id;
12576
- const std::string piece = llama_token_to_piece(ctx, id);
12577
- if (id == eos) {
12578
- if (!allow_eos) {
13513
+ const std::string piece = llama_token_to_piece(ctx, id, false);
13514
+
13515
+ if (llama_token_is_eog(&ctx->model, id)) {
13516
+ if (!allow_eog) {
12579
13517
  candidates->data[i].logit = -INFINITY;
12580
13518
  }
12581
13519
  } else if (piece.empty() || piece[0] == 0) {
@@ -12738,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
12738
13676
  return result;
12739
13677
  }
12740
13678
 
12741
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13679
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
12742
13680
  GGML_ASSERT(ctx);
12743
13681
 
12744
13682
  const int64_t t_start_sample_us = ggml_time_us();
@@ -12751,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
12751
13689
  }
12752
13690
 
12753
13691
  std::discrete_distribution<> dist(probs.begin(), probs.end());
12754
- auto & rng = ctx->rng;
12755
13692
  int idx = dist(rng);
12756
13693
 
12757
13694
  llama_token result = candidates->data[idx].id;
@@ -12761,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
12761
13698
  return result;
12762
13699
  }
12763
13700
 
13701
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13702
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
13703
+ }
13704
+
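With the RNG now an explicit argument, llama_sample_token becomes a thin wrapper that forwards the context's own generator. A minimal usage sketch for reproducible sampling with a caller-owned seed, assuming the with-rng variant is visible to the caller:

    #include <random>

    // 'ctx' and 'candidates' come from the usual decode/logits path (not shown)
    std::mt19937 rng(1234); // fixed seed -> identical draws across runs
    llama_token tok = llama_sample_token_with_rng(ctx, candidates, rng);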
12764
13705
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
12765
13706
  const int64_t t_start_sample_us = ggml_time_us();
12766
13707
 
12767
- if (token == llama_token_eos(&ctx->model)) {
13708
+ if (llama_token_is_eog(&ctx->model, token)) {
12768
13709
  for (const auto & stack : grammar->stacks) {
12769
13710
  if (stack.empty()) {
12770
13711
  return;
@@ -12773,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
12773
13714
  GGML_ASSERT(false);
12774
13715
  }
12775
13716
 
12776
- const std::string piece = llama_token_to_piece(ctx, token);
13717
+ const std::string piece = llama_token_to_piece(ctx, token, false);
12777
13718
 
12778
13719
  // Note terminating 0 in decoded string
12779
13720
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -12915,6 +13856,11 @@ struct llama_beam_search_data {
12915
13856
  }
12916
13857
  llama_logit_info logit_info(ctx);
12917
13858
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
13859
+
13860
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
13861
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
13862
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
13863
+
12918
13864
  size_t i=0;
12919
13865
  if (next_beams.size() < n_beams) {
12920
13866
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14481,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13535
14481
  gguf_set_kv (ctx_out, ml.meta);
13536
14482
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
13537
14483
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
14484
+ // Remove split metadata
14485
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
14486
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
14487
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
13538
14488
 
13539
14489
  if (params->kv_overrides) {
13540
14490
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -13587,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13587
14537
  std::vector<no_init<uint8_t>> work;
13588
14538
  std::vector<no_init<float>> f32_conv_buf;
13589
14539
 
14540
+ uint16_t n_split = 1;
14541
+ // Assume split indices are contiguous
14542
+ if (params->keep_split) {
14543
+ for (int i = 0; i < ml.n_tensors; ++i) {
14544
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
14545
+ }
14546
+ }
14547
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
14548
+ ctx_outs[0] = ctx_out;
14549
+
13590
14550
  // populate the original tensors so we get an initial meta data
13591
14551
  for (int i = 0; i < ml.n_tensors; ++i) {
13592
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
13593
- gguf_add_tensor(ctx_out, meta);
14552
+ auto weight = ml.get_weight(i);
14553
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
14554
+ struct ggml_tensor * tensor = weight->tensor;
14555
+ if (ctx_outs[i_split] == NULL) {
14556
+ ctx_outs[i_split] = gguf_init_empty();
14557
+ }
14558
+ gguf_add_tensor(ctx_outs[i_split], tensor);
13594
14559
  }
13595
14560
 
13596
- std::ofstream fout(fname_out, std::ios::binary);
13597
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
13598
-
13599
- const size_t meta_size = gguf_get_meta_size(ctx_out);
14561
+ // Set split info if needed
14562
+ if (n_split > 1) {
14563
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
14564
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
14565
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
14566
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
14567
+ }
14568
+ }
13600
14569
 
13601
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
14570
+ int cur_split = -1;
14571
+ std::ofstream fout;
14572
+ auto close_ofstream = [&]() {
14573
+ // Write metadata and close file handler
14574
+ if (fout.is_open()) {
14575
+ fout.seekp(0);
14576
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
14577
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
14578
+ fout.write((const char *) data.data(), data.size());
14579
+ fout.close();
14580
+ }
14581
+ };
14582
+ auto new_ofstream = [&](int index) {
14583
+ cur_split = index;
14584
+ GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
14585
+ std::string fname = fname_out;
14586
+ if (params->keep_split) {
14587
+ char split_path[PATH_MAX] = {0};
14588
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
14589
+ fname = std::string(split_path);
14590
+ }
13602
14591
 
13603
- // placeholder for the meta data
13604
- ::zeros(fout, meta_size);
14592
+ fout = std::ofstream(fname, std::ios::binary);
14593
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14594
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
14595
+ // placeholder for the meta data
14596
+ ::zeros(fout, meta_size);
14597
+ };
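The new_ofstream/close_ofstream pair above implements a two-pass write per output shard: reserve space for the GGUF metadata, stream the (re)quantized tensor data, then seek back and write the finalized metadata once tensor types and sizes are known; with keep_split enabled this repeats once per shard, with llama_split_path supplying each shard's file name. A condensed sketch of that flow for a single file (illustrative; names and error handling simplified):

    #include <cstdint>
    #include <fstream>
    #include <vector>

    std::ofstream out(fname, std::ios::binary);
    out.exceptions(std::ofstream::failbit);          // fail fast on write errors

    ::zeros(out, gguf_get_meta_size(ctx_out));       // placeholder for the metadata

    // ... write each tensor's data plus padding, updating ctx_out as types change ...

    out.seekp(0);                                    // back-fill the real metadata
    std::vector<uint8_t> meta(gguf_get_meta_size(ctx_out));
    gguf_get_meta_data(ctx_out, meta.data());
    out.write((const char *) meta.data(), meta.size());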
13605
14598
 
13606
14599
  const auto tn = LLM_TN(model.arch);
13607
-
14600
+ new_ofstream(0);
13608
14601
  for (int i = 0; i < ml.n_tensors; ++i) {
13609
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
14602
+ auto weight = ml.get_weight(i);
14603
+ struct ggml_tensor * tensor = weight->tensor;
14604
+ if (weight->idx != cur_split && params->keep_split) {
14605
+ close_ofstream();
14606
+ new_ofstream(weight->idx);
14607
+ }
13610
14608
 
13611
14609
  const std::string name = ggml_get_name(tensor);
13612
14610
 
@@ -13761,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13761
14759
  total_size_new += new_size;
13762
14760
 
13763
14761
  // update the gguf meta data as we go
13764
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
13765
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
14762
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
14763
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
13766
14764
 
13767
14765
  // write tensor data + padding
13768
14766
  fout.write((const char *) new_data, new_size);
13769
14767
  zeros(fout, GGML_PAD(new_size, align) - new_size);
13770
14768
  }
13771
-
13772
- // go back to beginning of file and write the updated meta data
13773
- {
13774
- fout.seekp(0);
13775
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
13776
- gguf_get_meta_data(ctx_out, data.data());
13777
- fout.write((const char *) data.data(), data.size());
14769
+ close_ofstream();
14770
+ for (auto & c:ctx_outs) {
14771
+ gguf_free(c);
13778
14772
  }
13779
14773
 
13780
- fout.close();
13781
-
13782
- gguf_free(ctx_out);
13783
-
13784
14774
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
13785
14775
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
13786
14776
 
@@ -14136,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
14136
15126
  /*.quantize_output_tensor =*/ true,
14137
15127
  /*.only_copy =*/ false,
14138
15128
  /*.pure =*/ false,
15129
+ /*.keep_split =*/ false,
14139
15130
  /*.imatrix =*/ nullptr,
14140
15131
  /*.kv_overrides =*/ nullptr,
14141
15132
  };
@@ -14629,18 +15620,22 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14629
15620
  case LLM_ARCH_MINICPM:
14630
15621
  case LLM_ARCH_XVERSE:
14631
15622
  case LLM_ARCH_COMMAND_R:
15623
+ case LLM_ARCH_OLMO:
14632
15624
  return LLAMA_ROPE_TYPE_NORM;
14633
15625
 
14634
15626
  // the pairs of head values are offset by n_rot/2
14635
15627
  case LLM_ARCH_FALCON:
14636
15628
  case LLM_ARCH_GROK:
15629
+ case LLM_ARCH_DBRX:
14637
15630
  case LLM_ARCH_PERSIMMON:
14638
15631
  case LLM_ARCH_BERT:
14639
15632
  case LLM_ARCH_NOMIC_BERT:
14640
15633
  case LLM_ARCH_STABLELM:
14641
15634
  case LLM_ARCH_QWEN:
14642
15635
  case LLM_ARCH_QWEN2:
15636
+ case LLM_ARCH_QWEN2MOE:
14643
15637
  case LLM_ARCH_PHI2:
15638
+ case LLM_ARCH_PHI3:
14644
15639
  case LLM_ARCH_GEMMA:
14645
15640
  case LLM_ARCH_STARCODER2:
14646
15641
  return LLAMA_ROPE_TYPE_NEOX;
@@ -14654,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14654
15649
  return LLAMA_ROPE_TYPE_NONE;
14655
15650
  }
14656
15651
 
15652
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
15653
+ return ctx->cparams.pooling_type;
15654
+ }
15655
+
14657
15656
  int32_t llama_n_vocab(const struct llama_model * model) {
14658
15657
  return model->hparams.n_vocab;
14659
15658
  }
@@ -15132,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
15132
16131
  *
15133
16132
  */
15134
16133
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
16134
+ llama_synchronize(ctx);
16135
+
15135
16136
  // copy rng
15136
16137
  {
15137
16138
  std::ostringstream rng_ss;
@@ -15284,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
15284
16285
 
15285
16286
  // Sets the state reading from the specified source address
15286
16287
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16288
+ llama_synchronize(ctx);
16289
+
15287
16290
  const uint8_t * inp = src;
15288
16291
 
15289
16292
  // set rng
@@ -15320,6 +16323,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15320
16323
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15321
16324
  ctx->output_ids[id] = i;
15322
16325
  }
16326
+
16327
+ ctx->n_outputs = n_outputs;
15323
16328
  }
15324
16329
  }
15325
16330
 
@@ -15586,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
15586
16591
  }
15587
16592
 
15588
16593
  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16594
+ llama_synchronize(ctx);
16595
+
15589
16596
  const auto & kv_self = ctx->kv_self;
15590
16597
  GGML_ASSERT(!kv_self.recurrent); // not implemented
15591
16598
 
@@ -15703,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
15703
16710
  }
15704
16711
 
15705
16712
  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16713
+ llama_synchronize(ctx);
16714
+
15706
16715
  auto & kv_self = ctx->kv_self;
15707
16716
  GGML_ASSERT(!kv_self.recurrent); // not implemented
15708
16717
 
@@ -16154,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
16154
17163
  return model->vocab.id_to_token[token].type;
16155
17164
  }
16156
17165
 
17166
+ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17167
+ return token != -1 && (
17168
+ token == llama_token_eos(model) ||
17169
+ token == llama_token_eot(model)
17170
+ );
17171
+ }
17172
+
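llama_token_is_eog generalizes the old end-of-sequence check so that either EOS or EOT ends generation, which is what the grammar sampling code above now tests. A minimal sketch of the corresponding stop check in a generation loop (loop structure assumed, not shown in the diff):

    // after sampling 'new_token_id' for the current step
    if (llama_token_is_eog(model, new_token_id)) {
        break; // the model emitted an end-of-generation token (EOS or EOT)
    }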
16157
17173
  llama_token llama_token_bos(const struct llama_model * model) {
16158
17174
  return model->vocab.special_bos_id;
16159
17175
  }
@@ -16231,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
16231
17247
  }
16232
17248
 
16233
17249
  // does not write null-terminator to buf
16234
- int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
17250
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
16235
17251
  if (0 <= token && token < llama_n_vocab(model)) {
16236
17252
  switch (llama_vocab_get_type(model->vocab)) {
16237
17253
  case LLAMA_VOCAB_TYPE_WPM:
@@ -16246,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16246
17262
  }
16247
17263
  memcpy(buf, result.c_str(), result.length());
16248
17264
  return result.length();
16249
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17265
+ } else if (
17266
+ (llama_is_user_defined_token(model->vocab, token)) ||
17267
+ (llama_is_control_token (model->vocab, token) && special)) {
16250
17268
  std::string result = model->vocab.id_to_token[token].text;
16251
17269
  if (length < (int) result.length()) {
16252
17270
  return -(int) result.length();
@@ -16259,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16259
17277
  }
16260
17278
  memcpy(buf, "\xe2\x96\x85", 3);
16261
17279
  return 3;
16262
- } else if (llama_is_control_token(model->vocab, token)) {
16263
- ;
16264
17280
  } else if (llama_is_byte_token(model->vocab, token)) {
16265
17281
  if (length < 1) {
16266
17282
  return -1;
@@ -16281,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16281
17297
  }
16282
17298
  memcpy(buf, result.c_str(), result.length());
16283
17299
  return result.length();
16284
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17300
+ } else if (
17301
+ (llama_is_user_defined_token(model->vocab, token)) ||
17302
+ (llama_is_control_token (model->vocab, token) && special)) {
16285
17303
  std::string result = model->vocab.id_to_token[token].text;
16286
17304
  if (length < (int) result.length()) {
16287
17305
  return -(int) result.length();
16288
17306
  }
16289
17307
  memcpy(buf, result.c_str(), result.length());
16290
17308
  return result.length();
16291
- } else if (llama_is_control_token(model->vocab, token)) {
16292
- ;
16293
17309
  }
16294
17310
  break;
16295
17311
  }
@@ -16472,6 +17488,39 @@ static int32_t llama_chat_apply_template_internal(
16472
17488
  if (add_ass) {
16473
17489
  ss << "### Response:\n";
16474
17490
  }
17491
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
17492
+ // CohereForAI/c4ai-command-r-plus
17493
+ for (auto message : chat) {
17494
+ std::string role(message->role);
17495
+ if (role == "system") {
17496
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17497
+ } else if (role == "user") {
17498
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17499
+ } else if (role == "assistant") {
17500
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17501
+ }
17502
+ }
17503
+ if (add_ass) {
17504
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17505
+ }
17506
+ } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
17507
+ // Llama 3
17508
+ for (auto message : chat) {
17509
+ std::string role(message->role);
17510
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
17511
+ }
17512
+ if (add_ass) {
17513
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17514
+ }
17515
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17516
+ // Phi 3
17517
+ for (auto message : chat) {
17518
+ std::string role(message->role);
17519
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17520
+ }
17521
+ if (add_ass) {
17522
+ ss << "<|assistant|>\n";
17523
+ }
16475
17524
  } else {
16476
17525
  // template not supported
16477
17526
  return -1;
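The new Llama 3 branch above wraps every turn in header markers and terminates it with <|eot_id|>. For a single user message with add_ass = true, the string it assembles looks like this (derived directly from the stream operations above, shown as a C++ literal for readability):

    // chat = [{role: "user", content: "Hello"}], add_ass = true
    const char * prompt =
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "Hello<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n";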
@@ -16604,6 +17653,11 @@ const char * llama_print_system_info(void) {
16604
17653
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
16605
17654
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
16606
17655
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
+ #ifdef GGML_USE_LLAMAFILE
17657
+ s += "LAMMAFILE = 1 | ";
17658
+ #else
17659
+ s += "LAMMAFILE = 0 | ";
17660
+ #endif
16607
17661
 
16608
17662
  return s.c_str();
16609
17663
  }