llama_cpp 0.14.5 → 0.14.7

@@ -105,7 +105,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 8
+ #define LLAMA_MAX_EXPERTS 60


  //
@@ -209,7 +209,9 @@ enum llm_arch {
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
@@ -220,6 +222,8 @@ enum llm_arch {
  LLM_ARCH_MAMBA,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
  LLM_ARCH_UNKNOWN,
  };

@@ -241,7 +245,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
  { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
@@ -252,6 +258,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MAMBA, "mamba" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -325,6 +333,10 @@ enum llm_kv {
  LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
  };

  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -397,6 +409,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
  };

  struct LLM_KV {
@@ -427,6 +443,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
  LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
@@ -438,6 +455,9 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -700,6 +720,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -735,6 +757,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -751,6 +795,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PHI3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_PLAMO,
  {
@@ -934,6 +995,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
+ {
+ LLM_ARCH_DBRX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_OLMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1528,12 +1619,12 @@ struct llama_mlock {
  };
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

- static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
  GGML_ASSERT(check == -n_tokens);
  }
  else {
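The hunk above reflects an API change in this release: llama_token_to_piece() now takes a trailing `special` flag that decides whether special tokens are rendered as text. A minimal caller-side sketch, assuming the public llama.h declaration carries the same trailing bool that the internal wrapper passes (the resize-and-retry loop mirrors that wrapper):

    // sketch only: convert one token to text with the new `special` argument
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string token_to_text(const struct llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        if (n < 0) {
            buf.resize(-n); // a negative return value reports the required buffer size
            n = llama_token_to_piece(model, token, buf.data(), buf.size(), special);
        }
        return std::string(buf.data(), n);
    }

Existing callers of the old four-argument form need to pass an explicit `special` value when upgrading.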
@@ -1690,6 +1781,7 @@ enum e_model {
  MODEL_4B,
  MODEL_7B,
  MODEL_8B,
+ MODEL_12B,
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
@@ -1705,8 +1797,10 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_A2_7B,
  MODEL_8x7B,
  MODEL_8x22B,
+ MODEL_16x12B,
  };

  static const size_t kiB = 1024;
@@ -1890,6 +1984,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down_exps;
  struct ggml_tensor * ffn_up_exps ;

+ // ff shared expert (shexp)
+ struct ggml_tensor * ffn_gate_inp_shexp;
+ struct ggml_tensor * ffn_gate_shexp;
+ struct ggml_tensor * ffn_down_shexp;
+ struct ggml_tensor * ffn_up_shexp;
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -2036,10 +2136,10 @@ struct llama_vocab {
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

  id linefeed_id = 13;
- id special_prefix_id = 32007;
- id special_middle_id = 32009;
- id special_suffix_id = 32008;
- id special_eot_id = 32010;
+ id special_prefix_id = -1;
+ id special_suffix_id = -1;
+ id special_middle_id = -1;
+ id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

  bool add_space_prefix = true;
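With the defaults above changed from the hard-coded CodeLlama ids (32007-32010) to -1, the prefix/suffix/middle/EOT ids are only set when the GGUF metadata (or the CodeLlama/CodeGemma fallback added later in this diff) provides them, so infill callers should treat -1 as "not available". A hedged sketch, assuming the public accessors llama_token_prefix(), llama_token_middle() and llama_token_suffix() expose these vocab fields:

    // sketch only: guard an infill/FIM prompt against models without FIM tokens
    #include <cstdio>
    #include "llama.h"

    static bool model_supports_infill(const struct llama_model * model) {
        // after this change the ids are -1 unless GGUF metadata or the
        // CodeLlama/CodeGemma fallback supplied them, so check before use
        if (llama_token_prefix(model) < 0 ||
            llama_token_middle(model) < 0 ||
            llama_token_suffix(model) < 0) {
            std::fprintf(stderr, "model does not declare FIM special tokens\n");
            return false;
        }
        return true;
    }
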
2045
2145
 
@@ -2899,9 +2999,13 @@ struct llama_model_loader {

  ggml_tensor * tensor;

- llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+ }
  }
  };
  std::vector<llama_tensor_weight> weights;
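The new constructor check rejects tensors whose data would extend past the end of the GGUF file; the first clause, `offs + ggml_nbytes(tensor) < offs`, can only be true if the unsigned addition wrapped around, so corrupt offsets and truncated files are both caught. A standalone sketch of the same idiom (the function name is illustrative, not part of the loader):

    // sketch only: overflow-safe "does [offs, offs + nbytes) fit in the file?" check
    #include <cstddef>

    static bool range_within_file(size_t offs, size_t nbytes, size_t file_size) {
        const size_t end = offs + nbytes;
        return end >= offs && end <= file_size; // end < offs means the sum overflowed
    }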
@@ -2940,15 +3044,15 @@ struct llama_model_loader {
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+ files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(0, cur->name, meta, cur);
+ weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  }
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
- contexts.emplace_back(ctx);
-
  uint16_t n_split = 0;
  get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

@@ -2982,12 +3086,13 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  }

+ files.emplace_back(new llama_file(split_path, "rb"));
+ contexts.emplace_back(ctx);
+
  // Save tensors data offset info of the shard.
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- weights.emplace_back(idx, cur->name, ctx_gguf, cur);
+ weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  }
- files.emplace_back(new llama_file(split_path, "rb"));
- contexts.emplace_back(ctx);

  gguf_free(ctx_gguf);
  }
@@ -3197,6 +3302,10 @@ struct llama_model_loader {
  return nullptr;
  }

+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
  const llama_tensor_weight & require_weight(const char * name) const {
  const llama_tensor_weight * weight = get_weight(name);
  if (!weight) {
@@ -3545,6 +3654,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
@@ -3560,8 +3670,10 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
  case MODEL_8x7B: return "8x7B";
  case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
  default: return "?B";
  }
  }
@@ -3686,7 +3798,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
+ case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3834,6 +3946,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_12B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -3858,10 +3971,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_A2_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_3B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
@@ -3983,6 +4114,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_16x12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4042,6 +4195,35 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA) {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA) {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ // TODO: this is not EOT, it is "file separator" token, needs fix
+ // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
+ //vocab.special_eot_id = 70;
+ vocab.special_eot_id = 107;
+ }
+ }
+
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4155,14 +4337,19 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
+
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
  int32_t & id = std::get<1>(it);
@@ -4177,7 +4364,6 @@ static void llm_load_vocab(
  } else {
  id = new_id;
  }
-
  }

  // Handle add_bos_token and add_eos_token
@@ -4191,6 +4377,28 @@ static void llm_load_vocab(
  vocab.special_add_eos = int(temp);
  }
  }
+
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ //
+ // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+ // for now, we apply this workaround to find the EOT token based on its text
+ if (vocab.special_eot_id == -1) {
+ for (const auto & t : vocab.token_to_id) {
+ if (
+ // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+ // need to fix convert script
+ //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+ (t.first == "<|eot_id|>" ||
+ t.first == "<|im_end|>" ||
+ t.first == "<|end|>" ||
+ t.first == "<end_of_turn>"
+ )
+ ) {
+ vocab.special_eot_id = t.second;
+ break;
+ }
+ }
+ }
  }

  // build special tokens cache
@@ -4353,14 +4561,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

  // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
+ if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
+ if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  }

  // Returns false if cancelled by progress_callback
@@ -4378,6 +4591,13 @@ static bool llm_load_tensors(

  auto & hparams = model.hparams;

+ #ifdef GGML_USE_SYCL
+ // disable MoE with SYCL until mul_mat_id is updated
+ if (hparams.n_expert > 0) {
+ n_gpu_layers = 0;
+ }
+ #endif
+
  model.split_mode = split_mode;
  model.main_gpu = main_gpu;
  model.n_gpu_layers = n_gpu_layers;
@@ -4475,7 +4695,7 @@ static bool llm_load_tensors(
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

  // for moe merged tensors
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+ ctx_size += ggml_tensor_overhead()*n_layer*3;

  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  for (auto & it : buft_layer_count) {
@@ -4671,6 +4891,39 @@ static bool llm_load_tensors(
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("DBRX model cannot have zero experts");
+ }
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
  case LLM_ARCH_BAICHUAN:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4985,8 +5238,13 @@ static bool llm_load_tensors(
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);

- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ // optional q and k layernorms, present in StableLM 2 12B
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5029,7 +5287,13 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5057,6 +5321,54 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+ // Shared expert branch
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5102,6 +5414,33 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  }
  } break;
+ case LLM_ARCH_PHI3:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context* ctx_layer = ctx_for_layer(i);
+ ggml_context* ctx_split = ctx_for_layer_split(i);
+
+ auto& layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+ }
+ } break;
  case LLM_ARCH_PLAMO:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5450,6 +5789,37 @@ static bool llm_load_tensors(
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5890,6 +6260,100 @@ static struct ggml_tensor * llm_build_ffn(
  return cur;
  }

+ static struct ggml_tensor * llm_build_moe_ffn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * gate_inp,
+ struct ggml_tensor * up_exps,
+ struct ggml_tensor * gate_exps,
+ struct ggml_tensor * down_exps,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ const llm_build_cb & cb,
+ int il) {
+ int64_t n_embd = cur->ne[0];
+ int64_t n_tokens = cur->ne[1];
+
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cb(selected_experts, "ffn_moe_topk", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx,
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights", il);
+
+ if (norm_w) {
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+ }
+
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(up, "ffn_moe_up", il);
+
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(gate, "ffn_moe_gate", il);
+
+ switch (type_op) {
+ case LLM_FFN_SILU:
+ {
+ gate = ggml_silu(ctx, gate);
+ cb(gate, "ffn_moe_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ {
+ gate = ggml_gelu(ctx, gate);
+ cb(gate, "ffn_moe_gelu", il);
+ } break;
+ default:
+ GGML_ASSERT(false);
+ }
+
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+ cb(par, "ffn_moe_gate_par", il);
+
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ cb(experts, "ffn_moe_down", il);
+
+ experts = ggml_mul(ctx, experts, weights);
+
+ // aggregate experts
+ ggml_tensor * moe_out = nullptr;
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+ experts->nb[2], i*experts->nb[1]);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
+ }
+ }
+
+ if (n_expert_used == 1) {
+ // avoid returning a non-contiguous tensor
+ moe_out = ggml_cont(ctx, moe_out);
+ }
+
+ return moe_out;
+ }
+
  // if max_alibi_bias > 0 then apply ALiBi
  static struct ggml_tensor * llm_build_kqv(
  struct ggml_context * ctx,
@@ -5928,7 +6392,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);

- if (model.arch == LLM_ARCH_PHI2) {
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
  // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
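The MoE graph code that several architectures previously duplicated is now factored into llm_build_moe_ffn() above: the gate logits go through a softmax, the top n_expert_used experts are selected, their weights are optionally renormalized (norm_w), and the per-expert FFN outputs are summed. A toy illustration of just the routing step on plain floats for a single token (names are illustrative, not part of llama.cpp):

    // sketch only: softmax + top-k routing with optional weight renormalization
    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <utility>
    #include <vector>

    static std::vector<std::pair<int, float>> route_token(
            const std::vector<float> & gate_logits, int n_expert_used, bool norm_w) {
        // softmax over the gate logits
        std::vector<float> probs(gate_logits.size());
        const float mx = *std::max_element(gate_logits.begin(), gate_logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < gate_logits.size(); ++i) {
            probs[i] = std::exp(gate_logits[i] - mx);
            sum += probs[i];
        }
        for (float & p : probs) { p /= sum; }

        // pick the n_expert_used experts with the highest probability
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                [&](int a, int b) { return probs[a] > probs[b]; });

        std::vector<std::pair<int, float>> selected;
        float used_sum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            selected.emplace_back(idx[i], probs[idx[i]]);
            used_sum += probs[idx[i]];
        }
        if (norm_w) {
            for (auto & s : selected) { s.second /= used_sum; } // renormalize selected weights
        }
        return selected;
    }

In the real helper the same arithmetic is expressed with ggml ops (ggml_soft_max, ggml_top_k, ggml_sum_rows, ggml_div) so it runs inside the compute graph.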
@@ -6433,63 +6897,16 @@ struct llm_build_context {
  LLM_NORM_RMS, cb, il);
  cb(cur, "ffn_norm", il);

- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
- cb(logits, "ffn_moe_logits", il);
-
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
- cb(probs, "ffn_moe_probs", il);
-
- // select experts
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
- ggml_tensor * weights = ggml_get_rows(ctx0,
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
- cb(weights, "ffn_moe_weights", il);
-
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
- cb(weights_sum, "ffn_moe_weights_sum", il);
-
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
- cb(weights, "ffn_moe_weights_norm", il);
-
- // compute expert outputs
- ggml_tensor * moe_out = nullptr;
-
- for (int i = 0; i < n_expert_used; ++i) {
- ggml_tensor * cur_expert;
-
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
- cb(cur_up, "ffn_moe_up", il);
-
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
- cb(cur_gate, "ffn_moe_gate", il);
-
- cur_gate = ggml_silu(ctx0, cur_gate);
- cb(cur_gate, "ffn_moe_silu", il);
-
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
- cb(cur_expert, "ffn_moe_gate_par", il);
-
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
- cb(cur_expert, "ffn_moe_down", il);
-
- cur_expert = ggml_mul(ctx0, cur_expert,
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
- cb(cur_expert, "ffn_moe_weighted", il);
-
- if (i == 0) {
- moe_out = cur_expert;
- } else {
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
- cb(moe_out, "ffn_moe_out", il);
- }
- }
-
- cur = moe_out;
- }
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+ }

  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);
@@ -6967,74 +7384,158 @@ struct llm_build_context {
  LLM_NORM_RMS, cb, il);
  cb(cur, "ffn_norm", il);

- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
- cb(logits, "ffn_moe_logits", il);
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_GELU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ // Grok
+ // if layer_out_norm is present then apply it before adding the input
+ // Idea: maybe ffn_out_norm is a better name
+ if (model.layers[il].layer_out_norm) {
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].layer_out_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "layer_out_norm", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+
+ // Grok
+ // multiply logits by output_multiplier_scale of 0.5773502691896257

- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
- cb(probs, "ffn_moe_probs", il);
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);

- // select experts
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cb(cur, "result_output", -1);

- ggml_tensor * weights = ggml_get_rows(ctx0,
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
- cb(weights, "ffn_moe_weights", il);
+ ggml_build_forward_expand(gf, cur);

- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+ return gf;
+ }
+
+ struct ggml_cgraph * build_dbrx() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
- cb(weights_sum, "ffn_moe_weights_sum", il);
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;

- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
- cb(weights, "ffn_moe_weights_norm", il);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);

- // compute expert outputs
- ggml_tensor * moe_out = nullptr;
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;

- for (int i = 0; i < n_expert_used; ++i) {
- ggml_tensor * cur_expert;
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
- cb(cur_up, "ffn_moe_up", il);
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();

- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
- cb(cur_gate, "ffn_moe_gate", il);
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

- //GeLU
- cur_gate = ggml_gelu(ctx0, cur_gate);
- cb(cur_gate, "ffn_moe_gelu", il);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;

- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
- cb(cur_expert, "ffn_moe_gate_par", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);

- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
- cb(cur_expert, "ffn_moe_down", il);
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = nullptr;
+ struct ggml_tensor * Kcur = nullptr;
+ struct ggml_tensor * Vcur = nullptr;

- cur_expert = ggml_mul(ctx0, cur_expert,
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
- cb(cur_expert, "ffn_moe_weighted", il);
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);

- if (i == 0) {
- moe_out = cur_expert;
- } else {
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
- cb(moe_out, "ffn_moe_out", il);
- }
- }
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(cur, "wqkv_clamped", il);

- cur = moe_out;
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "layer_out_norm", il);
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_out_norm", il);
+
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);

  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);
@@ -7052,18 +7553,13 @@ struct llm_build_context {
  cur = inpL;

  cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
+ model.output_norm, NULL,
+ LLM_NORM, cb, -1);
  cb(cur, "result_norm", -1);

  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);

- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
-
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
-
  cb(cur, "result_output", -1);

  ggml_build_forward_expand(gf, cur);
@@ -7923,7 +8419,7 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+

  // norm
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7932,6 +8428,8 @@ struct llm_build_context {
  LLM_NORM, cb, il);
  cb(cur, "attn_norm", il);

+ struct ggml_tensor * inpSA = cur;
+
  // self-attention
  {
  // compute Q and K and RoPE them
@@ -7956,15 +8454,36 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ cb(Kcur, "Kcur", il);
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Qcur, "Qcur", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
+
  Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ ctx0, Qcur, inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ ctx0, Kcur, inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7979,20 +8498,25 @@ struct llm_build_context {
  // skip computing output for unused tokens
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network
  {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "ffn_norm", il);
-
+ if (model.layers[il].ffn_norm) {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+ } else {
+ // parallel residual
+ cur = inpSA;
+ }
  cur = llm_build_ffn(ctx0, cur,
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8706,6 @@ struct llm_build_context {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);

- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
-
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8245,25 +8763,288 @@ struct llm_build_context {
  LLM_NORM_RMS, cb, -1);
  cb(cur, "result_norm", -1);

- // lm_head
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_qwen2moe() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
8863
+ model.layers[il].ffn_gate_exps,
8864
+ model.layers[il].ffn_down_exps,
8865
+ n_expert, n_expert_used,
8866
+ LLM_FFN_SILU, false,
8867
+ cb, il);
8868
+ cb(cur, "ffn_moe_out", il);
8869
+
8870
+ // FFN shared expert
8871
+ {
8872
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
8873
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
8874
+
8875
+ // sigmoid
8876
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
8877
+ cb(cur_gate, "ffn_shexp_gate", il);
8878
+
8879
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
8880
+ model.layers[il].ffn_up_shexp, NULL,
8881
+ model.layers[il].ffn_gate_shexp, NULL,
8882
+ model.layers[il].ffn_down_shexp, NULL,
8883
+ NULL,
8884
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
8885
+ cb(cur_ffn, "ffn_shexp", il);
8886
+
8887
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
8888
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
8889
+
8890
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
8891
+ cb(moe_out, "ffn_out", il);
8892
+
8893
+ cur = moe_out;
8894
+ }
8895
+
8896
+ cur = ggml_add(ctx0, cur, ffn_inp);
8897
+ cb(cur, "l_out", il);
8898
+
8899
+ // input for next layer
8900
+ inpL = cur;
8901
+ }
8902
+
8903
+ cur = inpL;
8904
+
8905
+ cur = llm_build_norm(ctx0, cur, hparams,
8906
+ model.output_norm, NULL,
8907
+ LLM_NORM_RMS, cb, -1);
8908
+ cb(cur, "result_norm", -1);
8909
+
8910
+ // lm_head
8911
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8912
+ cb(cur, "result_output", -1);
8913
+
8914
+ ggml_build_forward_expand(gf, cur);
8915
+
8916
+ return gf;
8917
+ }
8918
+
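
The shared-expert branch of build_qwen2moe above squashes its gate with a sigmoid that is not a primitive here: since silu(x) = x * sigmoid(x), the graph divides ggml_silu by its input to recover sigmoid. A minimal, ggml-free sketch of that identity (illustrative values only):

    #include <cassert>
    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // silu(x) = x * sigmoid(x), hence sigmoid(x) = silu(x) / x for x != 0
    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    int main() {
        for (float x : {-3.0f, -0.5f, 0.25f, 2.0f}) {
            const float sig_direct    = 1.0f / (1.0f + std::exp(-x));
            const float sig_from_silu = silu(x) / x;
            assert(std::fabs(sig_direct - sig_from_silu) < 1e-6f);
            std::printf("x = % .2f  sigmoid = %.6f\n", x, sig_direct);
        }
        return 0;
    }
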
8919
+ struct ggml_cgraph * build_phi2() {
8920
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8921
+
8922
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8923
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8924
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8925
+
8926
+ struct ggml_tensor * cur;
8927
+ struct ggml_tensor * attn_norm_output;
8928
+ struct ggml_tensor * ffn_output;
8929
+ struct ggml_tensor * inpL;
8930
+
8931
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8932
+
8933
+ // inp_pos - contains the positions
8934
+ struct ggml_tensor * inp_pos = build_inp_pos();
8935
+
8936
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8937
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8938
+
8939
+ for (int il = 0; il < n_layer; ++il) {
8940
+ attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
8941
+ model.layers[il].attn_norm,
8942
+ model.layers[il].attn_norm_b,
8943
+ LLM_NORM, cb, il);
8944
+ cb(attn_norm_output, "attn_norm", il);
8945
+
8946
+ // self-attention
8947
+ {
8948
+ struct ggml_tensor * Qcur = nullptr;
8949
+ struct ggml_tensor * Kcur = nullptr;
8950
+ struct ggml_tensor * Vcur = nullptr;
8951
+
8952
+ if (model.layers[il].wqkv) {
8953
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
8954
+ cb(cur, "wqkv", il);
8955
+
8956
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8957
+ cb(cur, "bqkv", il);
8958
+
8959
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8960
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8961
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8962
+ } else {
8963
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8964
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8965
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8966
+ }
8967
+
8968
+ cb(Qcur, "Qcur", il);
8969
+ cb(Kcur, "Kcur", il);
8970
+ cb(Vcur, "Vcur", il);
8971
+
8972
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8973
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8974
+
8975
+ Qcur = ggml_rope_custom(
8976
+ ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8977
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8978
+ );
8979
+ cb(Qcur, "Qcur", il);
8980
+
8981
+ // with phi2, we scale the Q to avoid precision issues
8982
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
8983
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
8984
+ cb(Qcur, "Qcur", il);
8985
+
8986
+ Kcur = ggml_rope_custom(
8987
+ ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
8988
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
8989
+ );
8990
+ cb(Kcur, "Kcur", il);
8991
+
8992
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8993
+ model.layers[il].wo, model.layers[il].bo,
8994
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8995
+ }
8996
+
8997
+ if (il == n_layer - 1) {
8998
+ // skip computing output for unused tokens
8999
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9000
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9001
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9002
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
9003
+ }
9004
+
9005
+ // FF
9006
+ {
9007
+ ffn_output = llm_build_ffn(ctx0, attn_norm_output,
9008
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
9009
+ NULL, NULL,
9010
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
9011
+ NULL,
9012
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
9013
+ cb(ffn_output, "ffn_out", il);
9014
+ }
9015
+
9016
+ cur = ggml_add(ctx0, cur, ffn_output);
9017
+ cb(cur, "l_out", il);
9018
+
9019
+ cur = ggml_add(ctx0, cur, inpL);
9020
+ cb(cur, "l_out", il);
9021
+
9022
+ inpL = cur;
9023
+ }
9024
+
9025
+ cur = llm_build_norm(ctx0, inpL, hparams,
9026
+ model.output_norm,
9027
+ model.output_norm_b,
9028
+ LLM_NORM, cb, -1);
9029
+ cb(cur, "result_norm", -1);
9030
+
8249
9031
  cur = ggml_mul_mat(ctx0, model.output, cur);
8250
- cb(cur, "result_output", -1);
9032
+ cb(cur, "result_output_no_bias", -1);
8251
9033
 
9034
+ cur = ggml_add(ctx0, cur, model.output_b);
9035
+ cb(cur, "result_output", -1);
8252
9036
  ggml_build_forward_expand(gf, cur);
8253
-
8254
9037
  return gf;
8255
9038
  }
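
build_phi2 above pre-scales Q by 1/sqrt(n_embd_head) and then passes a kq_scale of 1.0f to llm_build_kv; scaling Q before the QK^T product is numerically equivalent to scaling the attention scores afterwards, which is what the precision comment refers to. A hedged, ggml-free sketch of that equivalence (made-up vectors):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Illustrative only: dot(q / sqrt(d), k) == dot(q, k) / sqrt(d)
    static float dot(const std::vector<float> & a, const std::vector<float> & b) {
        float s = 0.0f;
        for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
        return s;
    }

    int main() {
        const std::vector<float> q = {0.5f, -1.25f, 2.0f, 0.75f};
        const std::vector<float> k = {1.0f,  0.5f, -0.5f, 2.5f};
        const float scale = 1.0f / std::sqrt(float(q.size()));

        std::vector<float> q_scaled(q);
        for (float & v : q_scaled) v *= scale;      // what ggml_scale does to Qcur above

        const float score_pre  = dot(q_scaled, k);  // scaled Q, kq_scale = 1.0f
        const float score_post = dot(q, k) * scale; // unscaled Q, scores scaled afterwards
        assert(std::fabs(score_pre - score_post) < 1e-6f);
        return 0;
    }
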
8256
9039
 
8257
- struct ggml_cgraph * build_phi2() {
9040
+ struct ggml_cgraph * build_phi3() {
8258
9041
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8259
9042
 
8260
9043
  const int64_t n_embd_head = hparams.n_embd_head_v;
8261
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
9044
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8262
9045
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8263
9046
 
8264
9047
  struct ggml_tensor * cur;
8265
- struct ggml_tensor * attn_norm_output;
8266
- struct ggml_tensor * ffn_output;
8267
9048
  struct ggml_tensor * inpL;
8268
9049
 
8269
9050
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8275,14 +9056,16 @@ struct llm_build_context {
8275
9056
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8276
9057
 
8277
9058
  for (int il = 0; il < n_layer; ++il) {
8278
- attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
8279
- model.layers[il].attn_norm,
8280
- model.layers[il].attn_norm_b,
8281
- LLM_NORM, cb, il);
8282
- cb(attn_norm_output, "attn_norm", il);
9059
+ auto residual = inpL;
8283
9060
 
8284
9061
  // self-attention
8285
9062
  {
9063
+ struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
9064
+ model.layers[il].attn_norm,
9065
+ NULL,
9066
+ LLM_NORM_RMS, cb, il);
9067
+ cb(attn_norm_output, "attn_norm", il);
9068
+
8286
9069
  struct ggml_tensor * Qcur = nullptr;
8287
9070
  struct ggml_tensor * Kcur = nullptr;
8288
9071
  struct ggml_tensor * Vcur = nullptr;
@@ -8291,13 +9074,11 @@ struct llm_build_context {
8291
9074
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
8292
9075
  cb(cur, "wqkv", il);
8293
9076
 
8294
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8295
- cb(cur, "bqkv", il);
8296
-
8297
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8298
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8299
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8300
- } else {
9077
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
9078
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
9079
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9080
+ }
9081
+ else {
8301
9082
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8302
9083
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8303
9084
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
@@ -8316,9 +9097,7 @@ struct llm_build_context {
8316
9097
  );
8317
9098
  cb(Qcur, "Qcur", il);
8318
9099
 
8319
- // with phi2, we scale the Q to avoid precision issues
8320
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
8321
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
9100
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
8322
9101
  cb(Qcur, "Qcur", il);
8323
9102
 
8324
9103
  Kcur = ggml_rope_custom(
@@ -8328,48 +9107,58 @@ struct llm_build_context {
8328
9107
  cb(Kcur, "Kcur", il);
8329
9108
 
8330
9109
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8331
- model.layers[il].wo, model.layers[il].bo,
8332
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9110
+ model.layers[il].wo, NULL,
9111
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
8333
9112
  }
8334
9113
 
8335
9114
  if (il == n_layer - 1) {
8336
9115
  // skip computing output for unused tokens
8337
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8338
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8339
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8340
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
9116
+ struct ggml_tensor* inp_out_ids = build_inp_out_ids();
9117
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9118
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
8341
9119
  }
8342
9120
 
9121
+ cur = ggml_add(ctx0, cur, residual);
9122
+ residual = cur;
9123
+
9124
+ cur = llm_build_norm(ctx0, cur, hparams,
9125
+ model.layers[il].ffn_norm, NULL,
9126
+ LLM_NORM_RMS, cb, il);
9127
+ cb(cur, "ffn_norm", il);
9128
+
8343
9129
  // FF
9130
+ // special-case: the up and gate tensors are merged into a single tensor
9131
+ // TODO: support in llm_build_ffn
8344
9132
  {
8345
- ffn_output = llm_build_ffn(ctx0, attn_norm_output,
8346
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
8347
- NULL, NULL,
8348
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8349
- NULL,
8350
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8351
- cb(ffn_output, "ffn_out", il);
8352
- }
9133
+ struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
9134
+ cb(up, "ffn_up", il);
8353
9135
 
8354
- cur = ggml_add(ctx0, cur, ffn_output);
8355
- cb(cur, "l_out", il);
9136
+ auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0));
9137
+ auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2));
8356
9138
 
8357
- cur = ggml_add(ctx0, cur, inpL);
9139
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
9140
+ cb(y, "ffn_gate", il);
9141
+
9142
+ auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
9143
+ cb(down, "ffn_down", il);
9144
+
9145
+ cur = down;
9146
+ cb(cur, "ffn_out", il);
9147
+ }
9148
+
9149
+ cur = ggml_add(ctx0, residual, cur);
8358
9150
  cb(cur, "l_out", il);
8359
9151
 
8360
9152
  inpL = cur;
8361
9153
  }
8362
9154
 
8363
9155
  cur = llm_build_norm(ctx0, inpL, hparams,
8364
- model.output_norm,
8365
- model.output_norm_b,
8366
- LLM_NORM, cb, -1);
9156
+ model.output_norm,
9157
+ NULL,
9158
+ LLM_NORM_RMS, cb, -1);
8367
9159
  cb(cur, "result_norm", -1);
8368
9160
 
8369
9161
  cur = ggml_mul_mat(ctx0, model.output, cur);
8370
- cb(cur, "result_output_no_bias", -1);
8371
-
8372
- cur = ggml_add(ctx0, cur, model.output_b);
8373
9162
  cb(cur, "result_output", -1);
8374
9163
 
8375
9164
  ggml_build_forward_expand(gf, cur);
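
In the build_phi3 feed-forward above, the gate and up projections are merged into a single ffn_up tensor; the graph views the first half of each row as the gate, the second half as the up activation, and combines them as up * silu(gate). A small sketch of that split on a plain buffer (hypothetical sizes, no ggml):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    int main() {
        const std::size_t n_ff = 4;
        // one row of the merged projection: [ gate[0..n_ff) | up[0..n_ff) ]
        std::vector<float> merged = {0.1f, -0.7f, 1.3f, 0.4f,   2.0f, -1.0f, 0.5f, 0.25f};

        std::vector<float> out(n_ff);
        for (std::size_t i = 0; i < n_ff; ++i) {
            const float gate = merged[i];         // first half of the row
            const float up   = merged[n_ff + i];  // second half of the row
            out[i] = up * silu(gate);             // SwiGLU-style combination, as in the mul above
        }
        return 0;
    }
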
@@ -8377,6 +9166,7 @@ struct llm_build_context {
8377
9166
  return gf;
8378
9167
  }
8379
9168
 
9169
+
8380
9170
  struct ggml_cgraph * build_plamo() {
8381
9171
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
8382
9172
 
@@ -9588,6 +10378,139 @@ struct llm_build_context {
9588
10378
  return gf;
9589
10379
 
9590
10380
  }
10381
+
10382
+ // ref: https://allenai.org/olmo
10383
+ // based on the original build_llama() function, changes:
10384
+ // * non-parametric layer norm
10385
+ // * clamp qkv
10386
+ // * removed bias
10387
+ // * removed MoE
10388
+ struct ggml_cgraph * build_olmo() {
10389
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10390
+
10391
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10392
+ int32_t n_tokens = this->n_tokens;
10393
+
10394
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10395
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10396
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10397
+
10398
+ struct ggml_tensor * cur;
10399
+ struct ggml_tensor * inpL;
10400
+
10401
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10402
+
10403
+ // inp_pos - contains the positions
10404
+ struct ggml_tensor * inp_pos = build_inp_pos();
10405
+
10406
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10407
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10408
+
10409
+ for (int il = 0; il < n_layer; ++il) {
10410
+ struct ggml_tensor * inpSA = inpL;
10411
+
10412
+ // norm
10413
+ cur = llm_build_norm(ctx0, inpL, hparams,
10414
+ NULL, NULL,
10415
+ LLM_NORM, cb, il);
10416
+ cb(cur, "attn_norm", il);
10417
+
10418
+ // self-attention
10419
+ {
10420
+ // compute Q and K and RoPE them
10421
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10422
+ cb(Qcur, "Qcur", il);
10423
+ if (hparams.f_clamp_kqv > 0.0f) {
10424
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10425
+ cb(Qcur, "Qcur", il);
10426
+ }
10427
+
10428
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10429
+ cb(Kcur, "Kcur", il);
10430
+ if (hparams.f_clamp_kqv > 0.0f) {
10431
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10432
+ cb(Kcur, "Kcur", il);
10433
+ }
10434
+
10435
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10436
+ cb(Vcur, "Vcur", il);
10437
+ if (hparams.f_clamp_kqv > 0.0f) {
10438
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10439
+ cb(Vcur, "Vcur", il);
10440
+ }
10441
+
10442
+ Qcur = ggml_rope_custom(
10443
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10444
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10445
+ ext_factor, attn_factor, beta_fast, beta_slow
10446
+ );
10447
+ cb(Qcur, "Qcur", il);
10448
+
10449
+ Kcur = ggml_rope_custom(
10450
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10451
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10452
+ ext_factor, attn_factor, beta_fast, beta_slow
10453
+ );
10454
+ cb(Kcur, "Kcur", il);
10455
+
10456
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10457
+ model.layers[il].wo, nullptr,
10458
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10459
+ }
10460
+
10461
+ if (il == n_layer - 1) {
10462
+ // skip computing output for unused tokens
10463
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10464
+ n_tokens = n_outputs;
10465
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10466
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10467
+ }
10468
+
10469
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10470
+ cb(ffn_inp, "ffn_inp", il);
10471
+
10472
+ // feed-forward network
10473
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10474
+ NULL, NULL,
10475
+ LLM_NORM, cb, il);
10476
+ cb(cur, "ffn_norm", il);
10477
+
10478
+ cur = llm_build_ffn(ctx0, cur,
10479
+ model.layers[il].ffn_up, NULL,
10480
+ model.layers[il].ffn_gate, NULL,
10481
+ model.layers[il].ffn_down, NULL,
10482
+ NULL,
10483
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10484
+ cb(cur, "ffn_out", il);
10485
+
10486
+ cur = ggml_add(ctx0, cur, ffn_inp);
10487
+ cb(cur, "ffn_out", il);
10488
+
10489
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10490
+ if (layer_dir != nullptr) {
10491
+ cur = ggml_add(ctx0, cur, layer_dir);
10492
+ }
10493
+ cb(cur, "l_out", il);
10494
+
10495
+ // input for next layer
10496
+ inpL = cur;
10497
+ }
10498
+
10499
+ cur = inpL;
10500
+
10501
+ cur = llm_build_norm(ctx0, cur, hparams,
10502
+ NULL, NULL,
10503
+ LLM_NORM, cb, -1);
10504
+ cb(cur, "result_norm", -1);
10505
+
10506
+ // lm_head
10507
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10508
+ cb(cur, "result_output", -1);
10509
+
10510
+ ggml_build_forward_expand(gf, cur);
10511
+
10512
+ return gf;
10513
+ }
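
build_olmo above uses non-parametric layer norm (NULL weight and bias) and, when f_clamp_kqv is positive, clamps Q, K and V to [-f_clamp_kqv, f_clamp_kqv]. A minimal sketch of the clamping step on a raw buffer (made-up values; ggml_clamp applies the same operation element-wise on the tensor):

    #include <algorithm>
    #include <vector>

    // Element-wise clamp to [-c, c], as applied to Qcur/Kcur/Vcur when f_clamp_kqv > 0
    static void clamp_kqv(std::vector<float> & v, float c) {
        for (float & x : v) x = std::min(std::max(x, -c), c);
    }

    int main() {
        std::vector<float> qcur = {-12.0f, -0.3f, 0.0f, 7.5f, 2.0f};
        clamp_kqv(qcur, 3.0f); // -> {-3.0, -0.3, 0.0, 3.0, 2.0}
        return 0;
    }
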
9591
10514
  };
9592
10515
 
9593
10516
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9737,10 +10660,18 @@ static struct ggml_cgraph * llama_build_graph(
9737
10660
  {
9738
10661
  result = llm.build_qwen2();
9739
10662
  } break;
10663
+ case LLM_ARCH_QWEN2MOE:
10664
+ {
10665
+ result = llm.build_qwen2moe();
10666
+ } break;
9740
10667
  case LLM_ARCH_PHI2:
9741
10668
  {
9742
10669
  result = llm.build_phi2();
9743
10670
  } break;
10671
+ case LLM_ARCH_PHI3:
10672
+ {
10673
+ result = llm.build_phi3();
10674
+ } break;
9744
10675
  case LLM_ARCH_PLAMO:
9745
10676
  {
9746
10677
  result = llm.build_plamo();
@@ -9785,6 +10716,14 @@ static struct ggml_cgraph * llama_build_graph(
9785
10716
  {
9786
10717
  result = llm.build_command_r();
9787
10718
  } break;
10719
+ case LLM_ARCH_DBRX:
10720
+ {
10721
+ result = llm.build_dbrx();
10722
+ } break;
10723
+ case LLM_ARCH_OLMO:
10724
+ {
10725
+ result = llm.build_olmo();
10726
+ } break;
9788
10727
  default:
9789
10728
  GGML_ASSERT(false);
9790
10729
  }
@@ -12556,16 +13495,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
12556
13495
  GGML_ASSERT(ctx);
12557
13496
  const int64_t t_start_sample_us = ggml_time_us();
12558
13497
 
12559
- bool allow_eos = false;
13498
+ bool allow_eog = false;
12560
13499
  for (const auto & stack : grammar->stacks) {
12561
13500
  if (stack.empty()) {
12562
- allow_eos = true;
13501
+ allow_eog = true;
12563
13502
  break;
12564
13503
  }
12565
13504
  }
12566
13505
 
12567
- const llama_token eos = llama_token_eos(&ctx->model);
12568
-
12569
13506
  std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
12570
13507
  candidates_decoded.reserve(candidates->size);
12571
13508
  std::vector<llama_grammar_candidate> candidates_grammar;
@@ -12573,9 +13510,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
12573
13510
 
12574
13511
  for (size_t i = 0; i < candidates->size; ++i) {
12575
13512
  const llama_token id = candidates->data[i].id;
12576
- const std::string piece = llama_token_to_piece(ctx, id);
12577
- if (id == eos) {
12578
- if (!allow_eos) {
13513
+ const std::string piece = llama_token_to_piece(ctx, id, false);
13514
+
13515
+ if (llama_token_is_eog(&ctx->model, id)) {
13516
+ if (!allow_eog) {
12579
13517
  candidates->data[i].logit = -INFINITY;
12580
13518
  }
12581
13519
  } else if (piece.empty() || piece[0] == 0) {
@@ -12738,7 +13676,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
12738
13676
  return result;
12739
13677
  }
12740
13678
 
12741
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13679
+ llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
12742
13680
  GGML_ASSERT(ctx);
12743
13681
 
12744
13682
  const int64_t t_start_sample_us = ggml_time_us();
@@ -12751,7 +13689,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
12751
13689
  }
12752
13690
 
12753
13691
  std::discrete_distribution<> dist(probs.begin(), probs.end());
12754
- auto & rng = ctx->rng;
12755
13692
  int idx = dist(rng);
12756
13693
 
12757
13694
  llama_token result = candidates->data[idx].id;
@@ -12761,10 +13698,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
12761
13698
  return result;
12762
13699
  }
12763
13700
 
13701
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
13702
+ return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
13703
+ }
13704
+
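
llama_sample_token is now a thin wrapper over llama_sample_token_with_rng, which takes the RNG by reference so a caller can sample reproducibly from its own generator instead of the context-internal one. A hedged usage sketch, assuming the declaration shown above is visible to the caller (it is an internal helper, so its presence in the public header is an assumption):

    #include <random>
    #include "llama.h"

    // Illustrative only: sample with a caller-owned, deterministically seeded RNG.
    llama_token sample_with_fixed_seed(llama_context * ctx, llama_token_data_array * candidates) {
        static std::mt19937 rng(42); // fixed seed -> reproducible draws across runs
        return llama_sample_token_with_rng(ctx, candidates, rng);
    }
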
12764
13705
  void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
12765
13706
  const int64_t t_start_sample_us = ggml_time_us();
12766
13707
 
12767
- if (token == llama_token_eos(&ctx->model)) {
13708
+ if (llama_token_is_eog(&ctx->model, token)) {
12768
13709
  for (const auto & stack : grammar->stacks) {
12769
13710
  if (stack.empty()) {
12770
13711
  return;
@@ -12773,7 +13714,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
12773
13714
  GGML_ASSERT(false);
12774
13715
  }
12775
13716
 
12776
- const std::string piece = llama_token_to_piece(ctx, token);
13717
+ const std::string piece = llama_token_to_piece(ctx, token, false);
12777
13718
 
12778
13719
  // Note terminating 0 in decoded string
12779
13720
  const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -12915,6 +13856,11 @@ struct llama_beam_search_data {
12915
13856
  }
12916
13857
  llama_logit_info logit_info(ctx);
12917
13858
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
13859
+
13860
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
13861
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
13862
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
13863
+
12918
13864
  size_t i=0;
12919
13865
  if (next_beams.size() < n_beams) {
12920
13866
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14481,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13535
14481
  gguf_set_kv (ctx_out, ml.meta);
13536
14482
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
13537
14483
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
14484
+ // Remove split metadata
14485
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
14486
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
14487
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
13538
14488
 
13539
14489
  if (params->kv_overrides) {
13540
14490
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -13587,26 +14537,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13587
14537
  std::vector<no_init<uint8_t>> work;
13588
14538
  std::vector<no_init<float>> f32_conv_buf;
13589
14539
 
14540
+ uint16_t n_split = 1;
14541
+ // Assume split indices are contiguous (0 .. n_split-1)
14542
+ if (params->keep_split) {
14543
+ for (int i = 0; i < ml.n_tensors; ++i) {
14544
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
14545
+ }
14546
+ }
14547
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
14548
+ ctx_outs[0] = ctx_out;
14549
+
13590
14550
  // populate the original tensors so we get an initial meta data
13591
14551
  for (int i = 0; i < ml.n_tensors; ++i) {
13592
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
13593
- gguf_add_tensor(ctx_out, meta);
14552
+ auto weight = ml.get_weight(i);
14553
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
14554
+ struct ggml_tensor * tensor = weight->tensor;
14555
+ if (ctx_outs[i_split] == NULL) {
14556
+ ctx_outs[i_split] = gguf_init_empty();
14557
+ }
14558
+ gguf_add_tensor(ctx_outs[i_split], tensor);
13594
14559
  }
13595
14560
 
13596
- std::ofstream fout(fname_out, std::ios::binary);
13597
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
13598
-
13599
- const size_t meta_size = gguf_get_meta_size(ctx_out);
14561
+ // Set split info if needed
14562
+ if (n_split > 1) {
14563
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
14564
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
14565
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
14566
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
14567
+ }
14568
+ }
13600
14569
 
13601
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
14570
+ int cur_split = -1;
14571
+ std::ofstream fout;
14572
+ auto close_ofstream = [&]() {
14573
+ // Write metadata and close the file handle
14574
+ if (fout.is_open()) {
14575
+ fout.seekp(0);
14576
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
14577
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
14578
+ fout.write((const char *) data.data(), data.size());
14579
+ fout.close();
14580
+ }
14581
+ };
14582
+ auto new_ofstream = [&](int index) {
14583
+ cur_split = index;
14584
+ GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
14585
+ std::string fname = fname_out;
14586
+ if (params->keep_split) {
14587
+ char split_path[PATH_MAX] = {0};
14588
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
14589
+ fname = std::string(split_path);
14590
+ }
13602
14591
 
13603
- // placeholder for the meta data
13604
- ::zeros(fout, meta_size);
14592
+ fout = std::ofstream(fname, std::ios::binary);
14593
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
14594
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
14595
+ // placeholder for the meta data
14596
+ ::zeros(fout, meta_size);
14597
+ };
13605
14598
 
13606
14599
  const auto tn = LLM_TN(model.arch);
13607
-
14600
+ new_ofstream(0);
13608
14601
  for (int i = 0; i < ml.n_tensors; ++i) {
13609
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
14602
+ auto weight = ml.get_weight(i);
14603
+ struct ggml_tensor * tensor = weight->tensor;
14604
+ if (weight->idx != cur_split && params->keep_split) {
14605
+ close_ofstream();
14606
+ new_ofstream(weight->idx);
14607
+ }
13610
14608
 
13611
14609
  const std::string name = ggml_get_name(tensor);
13612
14610
 
@@ -13761,26 +14759,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13761
14759
  total_size_new += new_size;
13762
14760
 
13763
14761
  // update the gguf meta data as we go
13764
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
13765
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
14762
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
14763
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
13766
14764
 
13767
14765
  // write tensor data + padding
13768
14766
  fout.write((const char *) new_data, new_size);
13769
14767
  zeros(fout, GGML_PAD(new_size, align) - new_size);
13770
14768
  }
13771
-
13772
- // go back to beginning of file and write the updated meta data
13773
- {
13774
- fout.seekp(0);
13775
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
13776
- gguf_get_meta_data(ctx_out, data.data());
13777
- fout.write((const char *) data.data(), data.size());
14769
+ close_ofstream();
14770
+ for (auto & c:ctx_outs) {
14771
+ gguf_free(c);
13778
14772
  }
13779
14773
 
13780
- fout.close();
13781
-
13782
- gguf_free(ctx_out);
13783
-
13784
14774
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
13785
14775
  LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
13786
14776
 
@@ -14136,6 +15126,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
14136
15126
  /*.quantize_output_tensor =*/ true,
14137
15127
  /*.only_copy =*/ false,
14138
15128
  /*.pure =*/ false,
15129
+ /*.keep_split =*/ false,
14139
15130
  /*.imatrix =*/ nullptr,
14140
15131
  /*.kv_overrides =*/ nullptr,
14141
15132
  };
@@ -14629,18 +15620,22 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14629
15620
  case LLM_ARCH_MINICPM:
14630
15621
  case LLM_ARCH_XVERSE:
14631
15622
  case LLM_ARCH_COMMAND_R:
15623
+ case LLM_ARCH_OLMO:
14632
15624
  return LLAMA_ROPE_TYPE_NORM;
14633
15625
 
14634
15626
  // the pairs of head values are offset by n_rot/2
14635
15627
  case LLM_ARCH_FALCON:
14636
15628
  case LLM_ARCH_GROK:
15629
+ case LLM_ARCH_DBRX:
14637
15630
  case LLM_ARCH_PERSIMMON:
14638
15631
  case LLM_ARCH_BERT:
14639
15632
  case LLM_ARCH_NOMIC_BERT:
14640
15633
  case LLM_ARCH_STABLELM:
14641
15634
  case LLM_ARCH_QWEN:
14642
15635
  case LLM_ARCH_QWEN2:
15636
+ case LLM_ARCH_QWEN2MOE:
14643
15637
  case LLM_ARCH_PHI2:
15638
+ case LLM_ARCH_PHI3:
14644
15639
  case LLM_ARCH_GEMMA:
14645
15640
  case LLM_ARCH_STARCODER2:
14646
15641
  return LLAMA_ROPE_TYPE_NEOX;
@@ -14654,6 +15649,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14654
15649
  return LLAMA_ROPE_TYPE_NONE;
14655
15650
  }
14656
15651
 
15652
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
15653
+ return ctx->cparams.pooling_type;
15654
+ }
15655
+
14657
15656
  int32_t llama_n_vocab(const struct llama_model * model) {
14658
15657
  return model->hparams.n_vocab;
14659
15658
  }
@@ -15132,6 +16131,8 @@ struct llama_data_file_context : llama_data_context {
15132
16131
  *
15133
16132
  */
15134
16133
  static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
16134
+ llama_synchronize(ctx);
16135
+
15135
16136
  // copy rng
15136
16137
  {
15137
16138
  std::ostringstream rng_ss;
@@ -15284,6 +16285,8 @@ size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
15284
16285
 
15285
16286
  // Sets the state reading from the specified source address
15286
16287
  size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
16288
+ llama_synchronize(ctx);
16289
+
15287
16290
  const uint8_t * inp = src;
15288
16291
 
15289
16292
  // set rng
@@ -15320,6 +16323,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15320
16323
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15321
16324
  ctx->output_ids[id] = i;
15322
16325
  }
16326
+
16327
+ ctx->n_outputs = n_outputs;
15323
16328
  }
15324
16329
  }
15325
16330
 
@@ -15586,6 +16591,8 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
15586
16591
  }
15587
16592
 
15588
16593
  static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
16594
+ llama_synchronize(ctx);
16595
+
15589
16596
  const auto & kv_self = ctx->kv_self;
15590
16597
  GGML_ASSERT(!kv_self.recurrent); // not implemented
15591
16598
 
@@ -15703,6 +16710,8 @@ size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_s
15703
16710
  }
15704
16711
 
15705
16712
  size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
16713
+ llama_synchronize(ctx);
16714
+
15706
16715
  auto & kv_self = ctx->kv_self;
15707
16716
  GGML_ASSERT(!kv_self.recurrent); // not implemented
15708
16717
 
@@ -16154,6 +17163,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
16154
17163
  return model->vocab.id_to_token[token].type;
16155
17164
  }
16156
17165
 
17166
+ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
17167
+ return token != -1 && (
17168
+ token == llama_token_eos(model) ||
17169
+ token == llama_token_eot(model)
17170
+ );
17171
+ }
17172
+
16157
17173
  llama_token llama_token_bos(const struct llama_model * model) {
16158
17174
  return model->vocab.special_bos_id;
16159
17175
  }
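
llama_token_is_eog above generalizes the end-of-generation check from the single EOS id to any EOG token (EOS or EOT), and it is what the grammar sampler and llama_grammar_accept_token now call. A hedged sketch of the typical call site in a generation loop (sampling and decoding are stand-ins, not part of this change):

    #include "llama.h"

    // Illustrative only: stop on any end-of-generation token instead of comparing
    // against llama_token_eos() alone. sample_next stands in for the caller's sampling.
    void generate_until_eog(llama_context * ctx, const llama_model * model,
                            llama_token (*sample_next)(llama_context *), int max_new_tokens) {
        for (int i = 0; i < max_new_tokens; ++i) {
            const llama_token tok = sample_next(ctx);
            if (llama_token_is_eog(model, tok)) {
                break; // EOS or EOT, per the helper added above
            }
            // ... append tok to the next batch and call llama_decode(ctx, batch) here ...
        }
    }
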
@@ -16231,7 +17247,7 @@ static std::string llama_decode_text(const std::string & text) {
16231
17247
  }
16232
17248
 
16233
17249
  // does not write null-terminator to buf
16234
- int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
17250
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
16235
17251
  if (0 <= token && token < llama_n_vocab(model)) {
16236
17252
  switch (llama_vocab_get_type(model->vocab)) {
16237
17253
  case LLAMA_VOCAB_TYPE_WPM:
@@ -16246,7 +17262,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16246
17262
  }
16247
17263
  memcpy(buf, result.c_str(), result.length());
16248
17264
  return result.length();
16249
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17265
+ } else if (
17266
+ (llama_is_user_defined_token(model->vocab, token)) ||
17267
+ (llama_is_control_token (model->vocab, token) && special)) {
16250
17268
  std::string result = model->vocab.id_to_token[token].text;
16251
17269
  if (length < (int) result.length()) {
16252
17270
  return -(int) result.length();
@@ -16259,8 +17277,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16259
17277
  }
16260
17278
  memcpy(buf, "\xe2\x96\x85", 3);
16261
17279
  return 3;
16262
- } else if (llama_is_control_token(model->vocab, token)) {
16263
- ;
16264
17280
  } else if (llama_is_byte_token(model->vocab, token)) {
16265
17281
  if (length < 1) {
16266
17282
  return -1;
@@ -16281,15 +17297,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
16281
17297
  }
16282
17298
  memcpy(buf, result.c_str(), result.length());
16283
17299
  return result.length();
16284
- } else if (llama_is_user_defined_token(model->vocab, token)) {
17300
+ } else if (
17301
+ (llama_is_user_defined_token(model->vocab, token)) ||
17302
+ (llama_is_control_token (model->vocab, token) && special)) {
16285
17303
  std::string result = model->vocab.id_to_token[token].text;
16286
17304
  if (length < (int) result.length()) {
16287
17305
  return -(int) result.length();
16288
17306
  }
16289
17307
  memcpy(buf, result.c_str(), result.length());
16290
17308
  return result.length();
16291
- } else if (llama_is_control_token(model->vocab, token)) {
16292
- ;
16293
17309
  }
16294
17310
  break;
16295
17311
  }
@@ -16472,6 +17488,39 @@ static int32_t llama_chat_apply_template_internal(
16472
17488
  if (add_ass) {
16473
17489
  ss << "### Response:\n";
16474
17490
  }
17491
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
17492
+ // CohereForAI/c4ai-command-r-plus
17493
+ for (auto message : chat) {
17494
+ std::string role(message->role);
17495
+ if (role == "system") {
17496
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17497
+ } else if (role == "user") {
17498
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17499
+ } else if (role == "assistant") {
17500
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17501
+ }
17502
+ }
17503
+ if (add_ass) {
17504
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17505
+ }
17506
+ } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
17507
+ // Llama 3
17508
+ for (auto message : chat) {
17509
+ std::string role(message->role);
17510
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
17511
+ }
17512
+ if (add_ass) {
17513
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
17514
+ }
17515
+ } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
17516
+ // Phi 3
17517
+ for (auto message : chat) {
17518
+ std::string role(message->role);
17519
+ ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
17520
+ }
17521
+ if (add_ass) {
17522
+ ss << "<|assistant|>\n";
17523
+ }
16475
17524
  } else {
16476
17525
  // template not supported
16477
17526
  return -1;
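
The three new template branches above cover Command R (<|START_OF_TURN_TOKEN|>...), Llama 3 (<|start_header_id|>...) and Phi 3 (<|user|> ... <|end|>). A hedged usage sketch rendering a two-message chat with the "phi3" template through the public entry point, assuming the usual llama_chat_apply_template(model, tmpl, chat, n_msg, add_ass, buf, length) signature (messages and buffer size are made up):

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        const std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant." },
            { "user",   "Write a haiku about GGUF."    },
        };

        std::vector<char> buf(4096);
        // Passing the template name directly; with a loaded model, nullptr as tmpl would
        // select the template embedded in the GGUF metadata instead.
        const int32_t n = llama_chat_apply_template(
            nullptr, "phi3", chat.data(), chat.size(), /*add_ass=*/true,
            buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            std::printf("%.*s", n, buf.data());
            // expected shape:
            // <|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n
        }
        return 0;
    }
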
@@ -16604,6 +17653,11 @@ const char * llama_print_system_info(void) {
16604
17653
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
16605
17654
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
16606
17655
  s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
17656
+ #ifdef GGML_USE_LLAMAFILE
17657
+ s += "LAMMAFILE = 1 | ";
17658
+ #else
17659
+ s += "LAMMAFILE = 0 | ";
17660
+ #endif
16607
17661
 
16608
17662
  return s.c_str();
16609
17663
  }