llama_cpp 0.14.5 → 0.14.6

@@ -105,7 +105,7 @@
  #endif

  #define LLAMA_MAX_NODES 8192
- #define LLAMA_MAX_EXPERTS 8
+ #define LLAMA_MAX_EXPERTS 60


  //
@@ -209,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
+ LLM_ARCH_QWEN2MOE,
  LLM_ARCH_PHI2,
  LLM_ARCH_PLAMO,
  LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
  LLM_ARCH_MAMBA,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
+ LLM_ARCH_DBRX,
+ LLM_ARCH_OLMO,
  LLM_ARCH_UNKNOWN,
  };

@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
  { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
  { LLM_ARCH_PHI2, "phi2" },
  { LLM_ARCH_PLAMO, "plamo" },
  { LLM_ARCH_CODESHELL, "codeshell" },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_MAMBA, "mamba" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -325,6 +331,10 @@ enum llm_kv {
  LLM_KV_TOKENIZER_ADD_PREFIX,
  LLM_KV_TOKENIZER_HF_JSON,
  LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
  };

  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -397,6 +407,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
  };

  struct LLM_KV {
@@ -427,6 +441,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
  LLM_TENSOR_FFN_NORM,
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
@@ -438,6 +453,9 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_DOWN_SHEXP,
+ LLM_TENSOR_FFN_GATE_SHEXP,
+ LLM_TENSOR_FFN_UP_SHEXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -700,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
  {
@@ -735,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_PHI2,
  {
@@ -934,6 +976,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
+ {
+ LLM_ARCH_DBRX,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_OLMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1690,6 +1762,7 @@ enum e_model {
  MODEL_4B,
  MODEL_7B,
  MODEL_8B,
+ MODEL_12B,
  MODEL_13B,
  MODEL_14B,
  MODEL_15B,
@@ -1705,8 +1778,10 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_A2_7B,
  MODEL_8x7B,
  MODEL_8x22B,
+ MODEL_16x12B,
  };

  static const size_t kiB = 1024;
@@ -1890,6 +1965,12 @@ struct llama_layer {
  struct ggml_tensor * ffn_down_exps;
  struct ggml_tensor * ffn_up_exps ;

+ // ff shared expert (shexp)
+ struct ggml_tensor * ffn_gate_inp_shexp;
+ struct ggml_tensor * ffn_gate_shexp;
+ struct ggml_tensor * ffn_down_shexp;
+ struct ggml_tensor * ffn_up_shexp;
+
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
@@ -2036,10 +2117,10 @@ struct llama_vocab {
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

  id linefeed_id = 13;
- id special_prefix_id = 32007;
- id special_middle_id = 32009;
- id special_suffix_id = 32008;
- id special_eot_id = 32010;
+ id special_prefix_id = -1;
+ id special_suffix_id = -1;
+ id special_middle_id = -1;
+ id special_eot_id = -1;

  bool add_space_prefix = true;

@@ -3545,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_8B: return "8B";
+ case MODEL_12B: return "12B";
  case MODEL_13B: return "13B";
  case MODEL_14B: return "14B";
  case MODEL_15B: return "15B";
@@ -3560,8 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_A2_7B: return "A2.7B";
  case MODEL_8x7B: return "8x7B";
  case MODEL_8x22B: return "8x22B";
+ case MODEL_16x12B: return "16x12B";
  default: return "?B";
  }
  }
@@ -3834,6 +3918,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 24: model.type = e_model::MODEL_1B; break;
  case 32: model.type = e_model::MODEL_3B; break;
+ case 40: model.type = e_model::MODEL_12B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
@@ -3858,6 +3943,14 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_A2_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3983,6 +4076,28 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_16x12B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+ switch (hparams.n_layer) {
+ case 22: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -4042,6 +4157,32 @@ static void llm_load_vocab(
  vocab.special_cls_id = -1;
  vocab.special_mask_id = -1;

+ // For Fill-In-the-Middle (FIM)/infill models which where converted
+ // prior to support of FIM special tokens in GGUF, the following
+ // will allow those models to continue to work. The general names
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
+ // new versions of these models have been published.
+ std::string gen_name;
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
+
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
+ [](unsigned char c){ return std::tolower(c); });
+
+ if (gen_name.find("code") != std::string::npos) {
+ if (model.arch == LLM_ARCH_LLAMA) {
+ vocab.special_prefix_id = 32007;
+ vocab.special_suffix_id = 32008;
+ vocab.special_middle_id = 32009;
+ vocab.special_eot_id = 32010;
+ } else if (model.arch == LLM_ARCH_GEMMA) {
+ vocab.special_prefix_id = 67;
+ vocab.special_suffix_id = 69;
+ vocab.special_middle_id = 68;
+ vocab.special_eot_id = 70;
+ }
+ }
+
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
  if (add_space_prefix_keyidx != -1) {
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4155,13 +4296,17 @@ static void llm_load_vocab(
  // special tokens
  {
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  };
  for (const auto & it : special_token_types) {
  const std::string & key = kv(std::get<0>(it));
@@ -4378,6 +4523,13 @@ static bool llm_load_tensors(

  auto & hparams = model.hparams;

+ #ifdef GGML_USE_SYCL
+ // disable MoE with SYCL until mul_mat_id is updated
+ if (hparams.n_expert > 0) {
+ n_gpu_layers = 0;
+ }
+ #endif
+
  model.split_mode = split_mode;
  model.main_gpu = main_gpu;
  model.n_gpu_layers = n_gpu_layers;
@@ -4475,7 +4627,7 @@ static bool llm_load_tensors(
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

  // for moe merged tensors
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+ ctx_size += ggml_tensor_overhead()*n_layer*3;

  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  for (auto & it : buft_layer_count) {
@@ -4671,6 +4823,39 @@ static bool llm_load_tensors(
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  }
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("DBRX model cannot have zero experts");
+ }
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
  case LLM_ARCH_BAICHUAN:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4985,8 +5170,13 @@ static bool llm_load_tensors(
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);

- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+ // optional q and k layernorms, present in StableLM 2 12B
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);

  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5029,7 +5219,13 @@ static bool llm_load_tensors(
  // output
  {
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
  }

  for (int i = 0; i < n_layer; ++i) {
@@ -5057,6 +5253,54 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+ // optional bias tensors
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ GGML_ASSERT(hparams.n_expert > 0);
+ GGML_ASSERT(hparams.n_expert_used > 0);
+
+ // MoE branch
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
+
+ // Shared expert branch
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5450,6 +5694,37 @@ static bool llm_load_tensors(
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5890,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
  return cur;
  }

+ static struct ggml_tensor * llm_build_moe_ffn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * cur,
+ struct ggml_tensor * gate_inp,
+ struct ggml_tensor * up_exps,
+ struct ggml_tensor * gate_exps,
+ struct ggml_tensor * down_exps,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ const llm_build_cb & cb,
+ int il) {
+ int64_t n_embd = cur->ne[0];
+ int64_t n_tokens = cur->ne[1];
+
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
+ cb(logits, "ffn_moe_logits", il);
+
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_probs", il);
+
+ // select experts
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cb(selected_experts, "ffn_moe_topk", il);
+
+ ggml_tensor * weights = ggml_get_rows(ctx,
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights", il);
+
+ if (norm_w) {
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
+
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
+ cb(weights_sum, "ffn_moe_weights_sum", il);
+
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
+ cb(weights, "ffn_moe_weights_norm", il);
+
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
+ }
+
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(up, "ffn_moe_up", il);
+
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+ cb(gate, "ffn_moe_gate", il);
+
+ switch (type_op) {
+ case LLM_FFN_SILU:
+ {
+ gate = ggml_silu(ctx, gate);
+ cb(gate, "ffn_moe_silu", il);
+ } break;
+ case LLM_FFN_GELU:
+ {
+ gate = ggml_gelu(ctx, gate);
+ cb(gate, "ffn_moe_gelu", il);
+ } break;
+ default:
+ GGML_ASSERT(false);
+ }
+
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
+ cb(par, "ffn_moe_gate_par", il);
+
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+ cb(experts, "ffn_moe_down", il);
+
+ experts = ggml_mul(ctx, experts, weights);
+
+ // aggregate experts
+ ggml_tensor * moe_out = nullptr;
+ for (int i = 0; i < n_expert_used; ++i) {
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+ experts->nb[2], i*experts->nb[1]);
+
+ if (i == 0) {
+ moe_out = cur_expert;
+ } else {
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
+ }
+ }
+
+ if (n_expert_used == 1) {
+ // avoid returning a non-contiguous tensor
+ moe_out = ggml_cont(ctx, moe_out);
+ }
+
+ return moe_out;
+ }
+
  // if max_alibi_bias > 0 then apply ALiBi
  static struct ggml_tensor * llm_build_kqv(
  struct ggml_context * ctx,
@@ -6433,62 +6802,15 @@ struct llm_build_context {
  LLM_NORM_RMS, cb, il);
  cb(cur, "ffn_norm", il);

- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
- cb(logits, "ffn_moe_logits", il);
-
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
- cb(probs, "ffn_moe_probs", il);
-
- // select experts
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
-
- ggml_tensor * weights = ggml_get_rows(ctx0,
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
- cb(weights, "ffn_moe_weights", il);
-
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
-
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
- cb(weights_sum, "ffn_moe_weights_sum", il);
-
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
- cb(weights, "ffn_moe_weights_norm", il);
-
- // compute expert outputs
- ggml_tensor * moe_out = nullptr;
-
- for (int i = 0; i < n_expert_used; ++i) {
- ggml_tensor * cur_expert;
-
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
- cb(cur_up, "ffn_moe_up", il);
-
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
- cb(cur_gate, "ffn_moe_gate", il);
-
- cur_gate = ggml_silu(ctx0, cur_gate);
- cb(cur_gate, "ffn_moe_silu", il);
-
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
- cb(cur_expert, "ffn_moe_gate_par", il);
-
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
- cb(cur_expert, "ffn_moe_down", il);
-
- cur_expert = ggml_mul(ctx0, cur_expert,
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
- cb(cur_expert, "ffn_moe_weighted", il);
-
- if (i == 0) {
- moe_out = cur_expert;
- } else {
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
- cb(moe_out, "ffn_moe_out", il);
- }
- }
-
- cur = moe_out;
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
  }

  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6967,74 +7289,158 @@ struct llm_build_context {
  LLM_NORM_RMS, cb, il);
  cb(cur, "ffn_norm", il);

- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
- cb(logits, "ffn_moe_logits", il);
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_GELU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ // Grok
+ // if layer_out_norm is present then apply it before adding the input
+ // Idea: maybe ffn_out_norm is a better name
+ if (model.layers[il].layer_out_norm) {
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].layer_out_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "layer_out_norm", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }

- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
- cb(probs, "ffn_moe_probs", il);
+ cur = inpL;

- // select experts
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);

- ggml_tensor * weights = ggml_get_rows(ctx0,
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
- cb(weights, "ffn_moe_weights", il);
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);

- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+ // Grok
+ // multiply logits by output_multiplier_scale of 0.5773502691896257

- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
- cb(weights_sum, "ffn_moe_weights_sum", il);
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);

- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
- cb(weights, "ffn_moe_weights_norm", il);
+ cb(cur, "result_output", -1);

- // compute expert outputs
- ggml_tensor * moe_out = nullptr;
+ ggml_build_forward_expand(gf, cur);

- for (int i = 0; i < n_expert_used; ++i) {
- ggml_tensor * cur_expert;
+ return gf;
+ }

- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
- cb(cur_up, "ffn_moe_up", il);
+ struct ggml_cgraph * build_dbrx() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
- cb(cur_gate, "ffn_moe_gate", il);
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;

- //GeLU
- cur_gate = ggml_gelu(ctx0, cur_gate);
- cb(cur_gate, "ffn_moe_gelu", il);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);

- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
- cb(cur_expert, "ffn_moe_gate_par", il);
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;

- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
- cb(cur_expert, "ffn_moe_down", il);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

- cur_expert = ggml_mul(ctx0, cur_expert,
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
- cb(cur_expert, "ffn_moe_weighted", il);
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();

- if (i == 0) {
- moe_out = cur_expert;
- } else {
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
- cb(moe_out, "ffn_moe_out", il);
- }
- }
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

- cur = moe_out;
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;

- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "layer_out_norm", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = nullptr;
+ struct ggml_tensor * Kcur = nullptr;
+ struct ggml_tensor * Vcur = nullptr;
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(cur, "wqkv_clamped", il);
+
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }

+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_out_norm", il);
+
+ cur = llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);

  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);
@@ -7052,18 +7458,13 @@ struct llm_build_context {
  cur = inpL;

  cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
+ model.output_norm, NULL,
+ LLM_NORM, cb, -1);
  cb(cur, "result_norm", -1);

  // lm_head
  cur = ggml_mul_mat(ctx0, model.output, cur);

- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
-
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
-
  cb(cur, "result_output", -1);

  ggml_build_forward_expand(gf, cur);
@@ -7923,7 +8324,7 @@ struct llm_build_context {
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

  for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+

  // norm
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7932,6 +8333,8 @@ struct llm_build_context {
  LLM_NORM, cb, il);
  cb(cur, "attn_norm", il);

+ struct ggml_tensor * inpSA = cur;
+
  // self-attention
  {
  // compute Q and K and RoPE them
@@ -7956,15 +8359,36 @@ struct llm_build_context {
  cb(Vcur, "Vcur", il);
  }

+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ cb(Qcur, "Qcur", il);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ cb(Kcur, "Kcur", il);
+
+ if (model.layers[il].attn_q_norm) {
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Qcur, "Qcur", il);
+ }
+ if (model.layers[il].attn_k_norm) {
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ NULL,
+ LLM_NORM, cb, il);
+ cb(Kcur, "Kcur", il);
+ }
+
+
  Qcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ ctx0, Qcur, inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  cb(Qcur, "Qcur", il);

  Kcur = ggml_rope_custom(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ ctx0, Kcur, inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
@@ -7979,20 +8403,25 @@ struct llm_build_context {
  // skip computing output for unused tokens
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network
  {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm,
- model.layers[il].ffn_norm_b,
- LLM_NORM, cb, il);
- cb(cur, "ffn_norm", il);
-
+ if (model.layers[il].ffn_norm) {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+ } else {
+ // parallel residual
+ cur = inpSA;
+ }
  cur = llm_build_ffn(ctx0, cur,
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8611,6 @@ struct llm_build_context {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  cb(Vcur, "Vcur", il);

- // these nodes are added to the graph together so that they are not reordered
- // by doing so, the number of splits in the graph is reduced
- ggml_build_forward_expand(gf, Qcur);
- ggml_build_forward_expand(gf, Kcur);
- ggml_build_forward_expand(gf, Vcur);
-
  Qcur = ggml_rope_custom(
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8254,6 +8677,150 @@ struct llm_build_context {
  return gf;
  }

+ struct ggml_cgraph * build_qwen2moe() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ llm_build_moe_ffn(ctx0, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+ // sigmoid
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+ cb(cur_gate, "ffn_shexp_gate", il);
+
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up_shexp, NULL,
+ model.layers[il].ffn_gate_shexp, NULL,
+ model.layers[il].ffn_down_shexp, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur_ffn, "ffn_shexp", il);
+
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+ cb(moe_out, "ffn_out", il);
+
+ cur = moe_out;
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct ggml_cgraph * build_phi2() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -9588,6 +10155,139 @@ struct llm_build_context {
  return gf;

  }
+
+ // ref: https://allenai.org/olmo
+ // based on the original build_llama() function, changes:
+ // * non-parametric layer norm
+ // * clamp qkv
+ // * removed bias
+ // * removed MoE
+ struct ggml_cgraph * build_olmo() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (hparams.f_clamp_kqv > 0.0f) {
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, nullptr,
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ NULL, NULL,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };

  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -9737,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_qwen2();
  } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ result = llm.build_qwen2moe();
+ } break;
  case LLM_ARCH_PHI2:
  {
  result = llm.build_phi2();
@@ -9785,6 +10489,14 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_command_r();
  } break;
+ case LLM_ARCH_DBRX:
+ {
+ result = llm.build_dbrx();
+ } break;
+ case LLM_ARCH_OLMO:
+ {
+ result = llm.build_olmo();
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -12915,6 +13627,11 @@ struct llama_beam_search_data {
  }
  llama_logit_info logit_info(ctx);
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
  size_t i=0;
  if (next_beams.size() < n_beams) {
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  gguf_set_kv (ctx_out, ml.meta);
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+ // Remove split metadata
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

  if (params->kv_overrides) {
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -14629,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_MINICPM:
  case LLM_ARCH_XVERSE:
  case LLM_ARCH_COMMAND_R:
+ case LLM_ARCH_OLMO:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
  case LLM_ARCH_FALCON:
  case LLM_ARCH_GROK:
+ case LLM_ARCH_DBRX:
  case LLM_ARCH_PERSIMMON:
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
  case LLM_ARCH_STABLELM:
  case LLM_ARCH_QWEN:
  case LLM_ARCH_QWEN2:
+ case LLM_ARCH_QWEN2MOE:
  case LLM_ARCH_PHI2:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_STARCODER2:
@@ -15320,6 +16044,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
  ctx->output_ids[id] = i;
  }
+
+ ctx->n_outputs = n_outputs;
  }
  }

@@ -16472,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
  if (add_ass) {
  ss << "### Response:\n";
  }
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+ // CohereForAI/c4ai-command-r-plus
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ } else if (role == "user") {
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ } else if (role == "assistant") {
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
+ }
+ }
+ if (add_ass) {
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
+ }
  } else {
  // template not supported
  return -1;