llama_cpp 0.14.5 → 0.14.6

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -105,7 +105,7 @@
105
105
  #endif
106
106
 
107
107
  #define LLAMA_MAX_NODES 8192
108
- #define LLAMA_MAX_EXPERTS 8
108
+ #define LLAMA_MAX_EXPERTS 60
109
109
 
110
110
 
111
111
  //
@@ -209,6 +209,7 @@ enum llm_arch {
209
209
  LLM_ARCH_STABLELM,
210
210
  LLM_ARCH_QWEN,
211
211
  LLM_ARCH_QWEN2,
212
+ LLM_ARCH_QWEN2MOE,
212
213
  LLM_ARCH_PHI2,
213
214
  LLM_ARCH_PLAMO,
214
215
  LLM_ARCH_CODESHELL,
@@ -220,6 +221,8 @@ enum llm_arch {
220
221
  LLM_ARCH_MAMBA,
221
222
  LLM_ARCH_XVERSE,
222
223
  LLM_ARCH_COMMAND_R,
224
+ LLM_ARCH_DBRX,
225
+ LLM_ARCH_OLMO,
223
226
  LLM_ARCH_UNKNOWN,
224
227
  };
225
228
 
@@ -241,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
241
244
  { LLM_ARCH_STABLELM, "stablelm" },
242
245
  { LLM_ARCH_QWEN, "qwen" },
243
246
  { LLM_ARCH_QWEN2, "qwen2" },
247
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
244
248
  { LLM_ARCH_PHI2, "phi2" },
245
249
  { LLM_ARCH_PLAMO, "plamo" },
246
250
  { LLM_ARCH_CODESHELL, "codeshell" },
@@ -252,6 +256,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
252
256
  { LLM_ARCH_MAMBA, "mamba" },
253
257
  { LLM_ARCH_XVERSE, "xverse" },
254
258
  { LLM_ARCH_COMMAND_R, "command-r" },
259
+ { LLM_ARCH_DBRX, "dbrx" },
260
+ { LLM_ARCH_OLMO, "olmo" },
255
261
  { LLM_ARCH_UNKNOWN, "(unknown)" },
256
262
  };
257
263
 
@@ -325,6 +331,10 @@ enum llm_kv {
325
331
  LLM_KV_TOKENIZER_ADD_PREFIX,
326
332
  LLM_KV_TOKENIZER_HF_JSON,
327
333
  LLM_KV_TOKENIZER_RWKV,
334
+ LLM_KV_TOKENIZER_PREFIX_ID,
335
+ LLM_KV_TOKENIZER_SUFFIX_ID,
336
+ LLM_KV_TOKENIZER_MIDDLE_ID,
337
+ LLM_KV_TOKENIZER_EOT_ID,
328
338
  };
329
339
 
330
340
  static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -397,6 +407,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
397
407
  { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
398
408
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
399
409
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
410
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
411
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
412
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
413
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
400
414
  };
401
415
 
402
416
  struct LLM_KV {
@@ -427,6 +441,7 @@ enum llm_tensor {
427
441
  LLM_TENSOR_ATTN_OUT_NORM,
428
442
  LLM_TENSOR_ATTN_ROT_EMBD,
429
443
  LLM_TENSOR_FFN_GATE_INP,
444
+ LLM_TENSOR_FFN_GATE_INP_SHEXP,
430
445
  LLM_TENSOR_FFN_NORM,
431
446
  LLM_TENSOR_FFN_GATE,
432
447
  LLM_TENSOR_FFN_DOWN,
@@ -438,6 +453,9 @@ enum llm_tensor {
438
453
  LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
439
454
  LLM_TENSOR_FFN_GATE_EXPS,
440
455
  LLM_TENSOR_FFN_UP_EXPS,
456
+ LLM_TENSOR_FFN_DOWN_SHEXP,
457
+ LLM_TENSOR_FFN_GATE_SHEXP,
458
+ LLM_TENSOR_FFN_UP_SHEXP,
441
459
  LLM_TENSOR_ATTN_Q_NORM,
442
460
  LLM_TENSOR_ATTN_K_NORM,
443
461
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -700,6 +718,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
700
718
  { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
701
719
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
702
720
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
721
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
722
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
703
723
  },
704
724
  },
705
725
  {
@@ -735,6 +755,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
735
755
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
736
756
  },
737
757
  },
758
+ {
759
+ LLM_ARCH_QWEN2MOE,
760
+ {
761
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
762
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
763
+ { LLM_TENSOR_OUTPUT, "output" },
764
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
765
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
766
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
767
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
768
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
769
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
770
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
771
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
772
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
773
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
774
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
775
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
776
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
777
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
778
+ },
779
+ },
738
780
  {
739
781
  LLM_ARCH_PHI2,
740
782
  {
@@ -934,6 +976,36 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
934
976
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
935
977
  },
936
978
  },
979
+ {
980
+ LLM_ARCH_DBRX,
981
+ {
982
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
983
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
984
+ { LLM_TENSOR_OUTPUT, "output" },
985
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
986
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
987
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
988
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
989
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
990
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
991
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
992
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
993
+ },
994
+ },
995
+ {
996
+ LLM_ARCH_OLMO,
997
+ {
998
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
999
+ { LLM_TENSOR_OUTPUT, "output" },
1000
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1001
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1002
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1003
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1004
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1005
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1006
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1007
+ },
1008
+ },
937
1009
  {
938
1010
  LLM_ARCH_UNKNOWN,
939
1011
  {
@@ -1690,6 +1762,7 @@ enum e_model {
1690
1762
  MODEL_4B,
1691
1763
  MODEL_7B,
1692
1764
  MODEL_8B,
1765
+ MODEL_12B,
1693
1766
  MODEL_13B,
1694
1767
  MODEL_14B,
1695
1768
  MODEL_15B,
@@ -1705,8 +1778,10 @@ enum e_model {
1705
1778
  MODEL_MEDIUM,
1706
1779
  MODEL_LARGE,
1707
1780
  MODEL_XL,
1781
+ MODEL_A2_7B,
1708
1782
  MODEL_8x7B,
1709
1783
  MODEL_8x22B,
1784
+ MODEL_16x12B,
1710
1785
  };
1711
1786
 
1712
1787
  static const size_t kiB = 1024;
@@ -1890,6 +1965,12 @@ struct llama_layer {
1890
1965
  struct ggml_tensor * ffn_down_exps;
1891
1966
  struct ggml_tensor * ffn_up_exps ;
1892
1967
 
1968
+ // ff shared expert (shexp)
1969
+ struct ggml_tensor * ffn_gate_inp_shexp;
1970
+ struct ggml_tensor * ffn_gate_shexp;
1971
+ struct ggml_tensor * ffn_down_shexp;
1972
+ struct ggml_tensor * ffn_up_shexp;
1973
+
1893
1974
  // ff bias
1894
1975
  struct ggml_tensor * ffn_down_b; // b2
1895
1976
  struct ggml_tensor * ffn_up_b; // b3
@@ -2036,10 +2117,10 @@ struct llama_vocab {
2036
2117
  int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
2037
2118
 
2038
2119
  id linefeed_id = 13;
2039
- id special_prefix_id = 32007;
2040
- id special_middle_id = 32009;
2041
- id special_suffix_id = 32008;
2042
- id special_eot_id = 32010;
2120
+ id special_prefix_id = -1;
2121
+ id special_suffix_id = -1;
2122
+ id special_middle_id = -1;
2123
+ id special_eot_id = -1;
2043
2124
 
2044
2125
  bool add_space_prefix = true;
2045
2126
 
@@ -3545,6 +3626,7 @@ static const char * llama_model_type_name(e_model type) {
3545
3626
  case MODEL_3B: return "3B";
3546
3627
  case MODEL_7B: return "7B";
3547
3628
  case MODEL_8B: return "8B";
3629
+ case MODEL_12B: return "12B";
3548
3630
  case MODEL_13B: return "13B";
3549
3631
  case MODEL_14B: return "14B";
3550
3632
  case MODEL_15B: return "15B";
@@ -3560,8 +3642,10 @@ static const char * llama_model_type_name(e_model type) {
3560
3642
  case MODEL_MEDIUM: return "0.4B";
3561
3643
  case MODEL_LARGE: return "0.8B";
3562
3644
  case MODEL_XL: return "1.5B";
3645
+ case MODEL_A2_7B: return "A2.7B";
3563
3646
  case MODEL_8x7B: return "8x7B";
3564
3647
  case MODEL_8x22B: return "8x22B";
3648
+ case MODEL_16x12B: return "16x12B";
3565
3649
  default: return "?B";
3566
3650
  }
3567
3651
  }
@@ -3834,6 +3918,7 @@ static void llm_load_hparams(
3834
3918
  switch (hparams.n_layer) {
3835
3919
  case 24: model.type = e_model::MODEL_1B; break;
3836
3920
  case 32: model.type = e_model::MODEL_3B; break;
3921
+ case 40: model.type = e_model::MODEL_12B; break;
3837
3922
  default: model.type = e_model::MODEL_UNKNOWN;
3838
3923
  }
3839
3924
  } break;
@@ -3858,6 +3943,14 @@ static void llm_load_hparams(
3858
3943
  default: model.type = e_model::MODEL_UNKNOWN;
3859
3944
  }
3860
3945
  } break;
3946
+ case LLM_ARCH_QWEN2MOE:
3947
+ {
3948
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3949
+ switch (hparams.n_layer) {
3950
+ case 24: model.type = e_model::MODEL_A2_7B; break;
3951
+ default: model.type = e_model::MODEL_UNKNOWN;
3952
+ }
3953
+ } break;
3861
3954
  case LLM_ARCH_PHI2:
3862
3955
  {
3863
3956
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3983,6 +4076,28 @@ static void llm_load_hparams(
3983
4076
  default: model.type = e_model::MODEL_UNKNOWN;
3984
4077
  }
3985
4078
  } break;
4079
+ case LLM_ARCH_DBRX:
4080
+ {
4081
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4082
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
4083
+
4084
+ switch (hparams.n_layer) {
4085
+ case 40: model.type = e_model::MODEL_16x12B; break;
4086
+ default: model.type = e_model::MODEL_UNKNOWN;
4087
+ }
4088
+ } break;
4089
+ case LLM_ARCH_OLMO:
4090
+ {
4091
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
4092
+ ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
4093
+
4094
+ switch (hparams.n_layer) {
4095
+ case 22: model.type = e_model::MODEL_1B; break;
4096
+ case 32: model.type = e_model::MODEL_7B; break;
4097
+ case 80: model.type = e_model::MODEL_70B; break;
4098
+ default: model.type = e_model::MODEL_UNKNOWN;
4099
+ }
4100
+ } break;
3986
4101
  default: (void)0;
3987
4102
  }
3988
4103
 
@@ -4042,6 +4157,32 @@ static void llm_load_vocab(
4042
4157
  vocab.special_cls_id = -1;
4043
4158
  vocab.special_mask_id = -1;
4044
4159
 
4160
+ // For Fill-In-the-Middle (FIM)/infill models which were converted
4161
+ // prior to support of FIM special tokens in GGUF, the following
4162
+ // will allow those models to continue to work. The general names
4163
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4164
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4165
+ // new versions of these models have been published.
4166
+ std::string gen_name;
4167
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4168
+
4169
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4170
+ [](unsigned char c){ return std::tolower(c); });
4171
+
4172
+ if (gen_name.find("code") != std::string::npos) {
4173
+ if (model.arch == LLM_ARCH_LLAMA) {
4174
+ vocab.special_prefix_id = 32007;
4175
+ vocab.special_suffix_id = 32008;
4176
+ vocab.special_middle_id = 32009;
4177
+ vocab.special_eot_id = 32010;
4178
+ } else if (model.arch == LLM_ARCH_GEMMA) {
4179
+ vocab.special_prefix_id = 67;
4180
+ vocab.special_suffix_id = 69;
4181
+ vocab.special_middle_id = 68;
4182
+ vocab.special_eot_id = 70;
4183
+ }
4184
+ }
4185
+
4045
4186
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4046
4187
  if (add_space_prefix_keyidx != -1) {
4047
4188
  vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
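
Note: the hunk above backfills FIM special-token IDs for CodeLlama- and CodeGemma-style models that were converted before GGUF gained dedicated keys for them. As orientation only, the sketch below assembles an infill token sequence in the common prefix/suffix/middle order; it is a minimal sketch, not an API from this package (build_infill_tokens is a hypothetical helper, and the hard-coded IDs are simply the CodeLlama values set above — real code would read them from the loaded vocab).

    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;

    // Hypothetical helper: lay out an infill request as <prefix_id> prefix <suffix_id> suffix <middle_id>,
    // after which generation continues with the "middle" span.
    static std::vector<llama_token> build_infill_tokens(
            const std::vector<llama_token> & prefix_tokens,
            const std::vector<llama_token> & suffix_tokens) {
        const llama_token prefix_id = 32007; // vocab.special_prefix_id (CodeLlama fallback above)
        const llama_token suffix_id = 32008; // vocab.special_suffix_id
        const llama_token middle_id = 32009; // vocab.special_middle_id

        std::vector<llama_token> out;
        out.push_back(prefix_id);
        out.insert(out.end(), prefix_tokens.begin(), prefix_tokens.end());
        out.push_back(suffix_id);
        out.insert(out.end(), suffix_tokens.begin(), suffix_tokens.end());
        out.push_back(middle_id);
        return out;
    }
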
@@ -4155,13 +4296,17 @@ static void llm_load_vocab(
4155
4296
  // special tokens
4156
4297
  {
4157
4298
  const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
4158
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4159
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4160
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4161
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4162
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4163
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4164
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4299
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4300
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4301
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4302
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4303
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4304
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4305
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4306
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
4307
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
4308
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
4309
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
4165
4310
  };
4166
4311
  for (const auto & it : special_token_types) {
4167
4312
  const std::string & key = kv(std::get<0>(it));
@@ -4378,6 +4523,13 @@ static bool llm_load_tensors(
4378
4523
 
4379
4524
  auto & hparams = model.hparams;
4380
4525
 
4526
+ #ifdef GGML_USE_SYCL
4527
+ // disable MoE with SYCL until mul_mat_id is updated
4528
+ if (hparams.n_expert > 0) {
4529
+ n_gpu_layers = 0;
4530
+ }
4531
+ #endif
4532
+
4381
4533
  model.split_mode = split_mode;
4382
4534
  model.main_gpu = main_gpu;
4383
4535
  model.n_gpu_layers = n_gpu_layers;
@@ -4475,7 +4627,7 @@ static bool llm_load_tensors(
4475
4627
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4476
4628
 
4477
4629
  // for moe merged tensors
4478
- ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4630
+ ctx_size += ggml_tensor_overhead()*n_layer*3;
4479
4631
 
4480
4632
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4481
4633
  for (auto & it : buft_layer_count) {
@@ -4671,6 +4823,39 @@ static bool llm_load_tensors(
4671
4823
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
4672
4824
  }
4673
4825
  } break;
4826
+ case LLM_ARCH_DBRX:
4827
+ {
4828
+ if (n_expert == 0) {
4829
+ throw std::runtime_error("DBRX model cannot have zero experts");
4830
+ }
4831
+
4832
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4833
+
4834
+ // output
4835
+ {
4836
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4837
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4838
+ }
4839
+
4840
+ for (int i = 0; i < n_layer; ++i) {
4841
+ ggml_context * ctx_layer = ctx_for_layer(i);
4842
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4843
+
4844
+ auto & layer = model.layers[i];
4845
+
4846
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4847
+
4848
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4849
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4850
+
4851
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
4852
+
4853
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4854
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4855
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
4856
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4857
+ }
4858
+ } break;
4674
4859
  case LLM_ARCH_BAICHUAN:
4675
4860
  {
4676
4861
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4985,8 +5170,13 @@ static bool llm_load_tensors(
4985
5170
  layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
4986
5171
  layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
4987
5172
 
4988
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4989
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
5173
+ // optional q and k layernorms, present in StableLM 2 12B
5174
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
5175
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
5176
+
5177
+ // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
5178
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
5179
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
4990
5180
 
4991
5181
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4992
5182
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5029,7 +5219,13 @@ static bool llm_load_tensors(
5029
5219
  // output
5030
5220
  {
5031
5221
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5032
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5222
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5223
+ // if output is NULL, init from the input tok embed
5224
+ if (model.output == NULL) {
5225
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5226
+ ml.n_created--; // artificial tensor
5227
+ ml.size_data += ggml_nbytes(model.output);
5228
+ }
5033
5229
  }
5034
5230
 
5035
5231
  for (int i = 0; i < n_layer; ++i) {
@@ -5057,6 +5253,54 @@ static bool llm_load_tensors(
5057
5253
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5058
5254
  }
5059
5255
  } break;
5256
+ case LLM_ARCH_QWEN2MOE:
5257
+ {
5258
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5259
+
5260
+ // output
5261
+ {
5262
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5263
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5264
+ }
5265
+
5266
+ for (int i = 0; i < n_layer; ++i) {
5267
+ ggml_context * ctx_layer = ctx_for_layer(i);
5268
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5269
+
5270
+ auto & layer = model.layers[i];
5271
+
5272
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5273
+
5274
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5275
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5276
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5277
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5278
+
5279
+ // optional bias tensors
5280
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5281
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5282
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5283
+
5284
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5285
+
5286
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
5287
+
5288
+ GGML_ASSERT(hparams.n_expert > 0);
5289
+ GGML_ASSERT(hparams.n_expert_used > 0);
5290
+
5291
+ // MoE branch
5292
+ auto n_ff_exp = n_ff / hparams.n_expert_used;
5293
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5294
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5295
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5296
+
5297
+ // Shared expert branch
5298
+ layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5299
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5300
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5301
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
5302
+ }
5303
+ } break;
5060
5304
  case LLM_ARCH_PHI2:
5061
5305
  {
5062
5306
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
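
Note on the QWEN2MOE case above: the routed experts use a reduced per-expert width, n_ff_exp = n_ff / n_expert_used, so the merged expert tensors come out as {n_embd, n_ff_exp, n_expert}, while the shared-expert tensors keep the full n_ff. Purely as illustrative arithmetic (not values read from any particular GGUF):

    n_ff_exp = n_ff / n_expert_used = 5632 / 4 = 1408
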
@@ -5450,6 +5694,37 @@ static bool llm_load_tensors(
5450
5694
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5451
5695
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5452
5696
 
5697
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5698
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5699
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5700
+ }
5701
+ } break;
5702
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
5703
+ {
5704
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5705
+
5706
+ // output
5707
+ {
5708
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
5709
+ // if output is NULL, init from the input tok embed
5710
+ if (model.output == NULL) {
5711
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5712
+ ml.n_created--; // artificial tensor
5713
+ ml.size_data += ggml_nbytes(model.output);
5714
+ }
5715
+ }
5716
+
5717
+ for (int i = 0; i < n_layer; ++i) {
5718
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5719
+
5720
+ auto & layer = model.layers[i];
5721
+
5722
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5723
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5724
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5725
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5726
+
5727
+
5453
5728
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5454
5729
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5455
5730
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -5890,6 +6165,100 @@ static struct ggml_tensor * llm_build_ffn(
5890
6165
  return cur;
5891
6166
  }
5892
6167
 
6168
+ static struct ggml_tensor * llm_build_moe_ffn(
6169
+ struct ggml_context * ctx,
6170
+ struct ggml_tensor * cur,
6171
+ struct ggml_tensor * gate_inp,
6172
+ struct ggml_tensor * up_exps,
6173
+ struct ggml_tensor * gate_exps,
6174
+ struct ggml_tensor * down_exps,
6175
+ int64_t n_expert,
6176
+ int64_t n_expert_used,
6177
+ llm_ffn_op_type type_op,
6178
+ bool norm_w,
6179
+ const llm_build_cb & cb,
6180
+ int il) {
6181
+ int64_t n_embd = cur->ne[0];
6182
+ int64_t n_tokens = cur->ne[1];
6183
+
6184
+ ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens]
6185
+ cb(logits, "ffn_moe_logits", il);
6186
+
6187
+ ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
6188
+ cb(probs, "ffn_moe_probs", il);
6189
+
6190
+ // select experts
6191
+ ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
6192
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
6193
+ cb(selected_experts, "ffn_moe_topk", il);
6194
+
6195
+ ggml_tensor * weights = ggml_get_rows(ctx,
6196
+ ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
6197
+ cb(weights, "ffn_moe_weights", il);
6198
+
6199
+ if (norm_w) {
6200
+ weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
6201
+
6202
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
6203
+ cb(weights_sum, "ffn_moe_weights_sum", il);
6204
+
6205
+ weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
6206
+ cb(weights, "ffn_moe_weights_norm", il);
6207
+
6208
+ weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
6209
+ }
6210
+
6211
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
6212
+ ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6213
+ cb(up, "ffn_moe_up", il);
6214
+
6215
+ ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
6216
+ cb(gate, "ffn_moe_gate", il);
6217
+
6218
+ switch (type_op) {
6219
+ case LLM_FFN_SILU:
6220
+ {
6221
+ gate = ggml_silu(ctx, gate);
6222
+ cb(gate, "ffn_moe_silu", il);
6223
+ } break;
6224
+ case LLM_FFN_GELU:
6225
+ {
6226
+ gate = ggml_gelu(ctx, gate);
6227
+ cb(gate, "ffn_moe_gelu", il);
6228
+ } break;
6229
+ default:
6230
+ GGML_ASSERT(false);
6231
+ }
6232
+
6233
+ ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
6234
+ cb(par, "ffn_moe_gate_par", il);
6235
+
6236
+ ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
6237
+ cb(experts, "ffn_moe_down", il);
6238
+
6239
+ experts = ggml_mul(ctx, experts, weights);
6240
+
6241
+ // aggregate experts
6242
+ ggml_tensor * moe_out = nullptr;
6243
+ for (int i = 0; i < n_expert_used; ++i) {
6244
+ ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
6245
+ experts->nb[2], i*experts->nb[1]);
6246
+
6247
+ if (i == 0) {
6248
+ moe_out = cur_expert;
6249
+ } else {
6250
+ moe_out = ggml_add(ctx, moe_out, cur_expert);
6251
+ }
6252
+ }
6253
+
6254
+ if (n_expert_used == 1) {
6255
+ // avoid returning a non-contiguous tensor
6256
+ moe_out = ggml_cont(ctx, moe_out);
6257
+ }
6258
+
6259
+ return moe_out;
6260
+ }
6261
+
5893
6262
  // if max_alibi_bias > 0 then apply ALiBi
5894
6263
  static struct ggml_tensor * llm_build_kqv(
5895
6264
  struct ggml_context * ctx,
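
Note: the new llm_build_moe_ffn helper above centralizes the expert-routing math that was previously inlined per architecture: softmax over the gate logits, top-k expert selection, optional renormalization of the selected weights, and a weighted sum of the selected experts' FFN outputs. The following is a rough scalar sketch of that routing for a single token (illustrative plain C++ only; it stands in for the ggml graph ops and ignores batching and the expert FFNs themselves):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // Toy routing for one token, mirroring llm_build_moe_ffn's steps on plain floats.
    static void route_token(const std::vector<float> & logits, int n_expert_used, bool norm_w,
                            std::vector<int> & idx, std::vector<float> & w) {
        const int n_expert = (int) logits.size();

        // ffn_moe_probs: softmax over the gate logits
        std::vector<float> probs(n_expert);
        const float max_logit = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - max_logit); sum += probs[e]; }
        for (float & p : probs) { p /= sum; }

        // ffn_moe_topk: pick the n_expert_used highest-probability experts
        idx.resize(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&probs](int a, int b) { return probs[a] > probs[b]; });
        idx.resize(n_expert_used);

        // ffn_moe_weights_norm: renormalize the selected weights when norm_w is true
        w.clear();
        float wsum = 0.0f;
        for (int e : idx) { w.push_back(probs[e]); wsum += probs[e]; }
        if (norm_w) { for (float & x : w) { x /= wsum; } }
    }

    int main() {
        std::vector<int> idx; std::vector<float> w;
        route_token({0.1f, 2.0f, -1.0f, 0.5f}, /*n_expert_used=*/2, /*norm_w=*/true, idx, w);
        for (size_t i = 0; i < idx.size(); ++i) {
            std::printf("expert %d weight %.3f\n", idx[i], w[i]);
        }
        // The MoE output is then sum_i w[i] * expert_ffn(idx[i], x), which the helper
        // evaluates with ggml_mul_mat_id over the merged *_exps tensors.
        return 0;
    }
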
@@ -6433,62 +6802,15 @@ struct llm_build_context {
6433
6802
  LLM_NORM_RMS, cb, il);
6434
6803
  cb(cur, "ffn_norm", il);
6435
6804
 
6436
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6437
- cb(logits, "ffn_moe_logits", il);
6438
-
6439
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6440
- cb(probs, "ffn_moe_probs", il);
6441
-
6442
- // select experts
6443
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6444
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
6445
-
6446
- ggml_tensor * weights = ggml_get_rows(ctx0,
6447
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6448
- cb(weights, "ffn_moe_weights", il);
6449
-
6450
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
6451
-
6452
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6453
- cb(weights_sum, "ffn_moe_weights_sum", il);
6454
-
6455
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6456
- cb(weights, "ffn_moe_weights_norm", il);
6457
-
6458
- // compute expert outputs
6459
- ggml_tensor * moe_out = nullptr;
6460
-
6461
- for (int i = 0; i < n_expert_used; ++i) {
6462
- ggml_tensor * cur_expert;
6463
-
6464
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6465
- cb(cur_up, "ffn_moe_up", il);
6466
-
6467
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6468
- cb(cur_gate, "ffn_moe_gate", il);
6469
-
6470
- cur_gate = ggml_silu(ctx0, cur_gate);
6471
- cb(cur_gate, "ffn_moe_silu", il);
6472
-
6473
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6474
- cb(cur_expert, "ffn_moe_gate_par", il);
6475
-
6476
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6477
- cb(cur_expert, "ffn_moe_down", il);
6478
-
6479
- cur_expert = ggml_mul(ctx0, cur_expert,
6480
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
6481
- cb(cur_expert, "ffn_moe_weighted", il);
6482
-
6483
- if (i == 0) {
6484
- moe_out = cur_expert;
6485
- } else {
6486
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
6487
- cb(moe_out, "ffn_moe_out", il);
6488
- }
6489
- }
6490
-
6491
- cur = moe_out;
6805
+ cur = llm_build_moe_ffn(ctx0, cur,
6806
+ model.layers[il].ffn_gate_inp,
6807
+ model.layers[il].ffn_up_exps,
6808
+ model.layers[il].ffn_gate_exps,
6809
+ model.layers[il].ffn_down_exps,
6810
+ n_expert, n_expert_used,
6811
+ LLM_FFN_SILU, true,
6812
+ cb, il);
6813
+ cb(cur, "ffn_moe_out", il);
6492
6814
  }
6493
6815
 
6494
6816
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6967,74 +7289,158 @@ struct llm_build_context {
6967
7289
  LLM_NORM_RMS, cb, il);
6968
7290
  cb(cur, "ffn_norm", il);
6969
7291
 
6970
- ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
6971
- cb(logits, "ffn_moe_logits", il);
7292
+ cur = llm_build_moe_ffn(ctx0, cur,
7293
+ model.layers[il].ffn_gate_inp,
7294
+ model.layers[il].ffn_up_exps,
7295
+ model.layers[il].ffn_gate_exps,
7296
+ model.layers[il].ffn_down_exps,
7297
+ n_expert, n_expert_used,
7298
+ LLM_FFN_GELU, true,
7299
+ cb, il);
7300
+ cb(cur, "ffn_moe_out", il);
7301
+
7302
+ // Grok
7303
+ // if layer_out_norm is present then apply it before adding the input
7304
+ // Idea: maybe ffn_out_norm is a better name
7305
+ if (model.layers[il].layer_out_norm) {
7306
+ cur = llm_build_norm(ctx0, cur, hparams,
7307
+ model.layers[il].layer_out_norm, NULL,
7308
+ LLM_NORM_RMS, cb, il);
7309
+ cb(cur, "layer_out_norm", il);
7310
+ }
7311
+
7312
+ cur = ggml_add(ctx0, cur, ffn_inp);
7313
+ cb(cur, "ffn_out", il);
7314
+
7315
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
7316
+ if (layer_dir != nullptr) {
7317
+ cur = ggml_add(ctx0, cur, layer_dir);
7318
+ }
7319
+ cb(cur, "l_out", il);
7320
+
7321
+ // input for next layer
7322
+ inpL = cur;
7323
+ }
6972
7324
 
6973
- ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
6974
- cb(probs, "ffn_moe_probs", il);
7325
+ cur = inpL;
6975
7326
 
6976
- // select experts
6977
- ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
6978
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
7327
+ cur = llm_build_norm(ctx0, cur, hparams,
7328
+ model.output_norm, NULL,
7329
+ LLM_NORM_RMS, cb, -1);
7330
+ cb(cur, "result_norm", -1);
6979
7331
 
6980
- ggml_tensor * weights = ggml_get_rows(ctx0,
6981
- ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
6982
- cb(weights, "ffn_moe_weights", il);
7332
+ // lm_head
7333
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6983
7334
 
6984
- weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
7335
+ // Grok
7336
+ // multiply logits by output_multiplier_scale of 0.5773502691896257
6985
7337
 
6986
- ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
6987
- cb(weights_sum, "ffn_moe_weights_sum", il);
7338
+ cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
6988
7339
 
6989
- weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
6990
- cb(weights, "ffn_moe_weights_norm", il);
7340
+ cb(cur, "result_output", -1);
6991
7341
 
6992
- // compute expert outputs
6993
- ggml_tensor * moe_out = nullptr;
7342
+ ggml_build_forward_expand(gf, cur);
6994
7343
 
6995
- for (int i = 0; i < n_expert_used; ++i) {
6996
- ggml_tensor * cur_expert;
7344
+ return gf;
7345
+ }
6997
7346
 
6998
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6999
- cb(cur_up, "ffn_moe_up", il);
7347
+ struct ggml_cgraph * build_dbrx() {
7348
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7000
7349
 
7001
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
7002
- cb(cur_gate, "ffn_moe_gate", il);
7350
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
7351
+ int32_t n_tokens = this->n_tokens;
7003
7352
 
7004
- //GeLU
7005
- cur_gate = ggml_gelu(ctx0, cur_gate);
7006
- cb(cur_gate, "ffn_moe_gelu", il);
7353
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7354
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7355
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7356
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7007
7357
 
7008
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
7009
- cb(cur_expert, "ffn_moe_gate_par", il);
7358
+ struct ggml_tensor * cur;
7359
+ struct ggml_tensor * inpL;
7010
7360
 
7011
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
7012
- cb(cur_expert, "ffn_moe_down", il);
7361
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
7013
7362
 
7014
- cur_expert = ggml_mul(ctx0, cur_expert,
7015
- ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
7016
- cb(cur_expert, "ffn_moe_weighted", il);
7363
+ // inp_pos - contains the positions
7364
+ struct ggml_tensor * inp_pos = build_inp_pos();
7017
7365
 
7018
- if (i == 0) {
7019
- moe_out = cur_expert;
7020
- } else {
7021
- moe_out = ggml_add(ctx0, moe_out, cur_expert);
7022
- cb(moe_out, "ffn_moe_out", il);
7023
- }
7024
- }
7366
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7367
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7025
7368
 
7026
- cur = moe_out;
7369
+ for (int il = 0; il < n_layer; ++il) {
7370
+ struct ggml_tensor * inpSA = inpL;
7027
7371
 
7028
- // Grok
7029
- // if layer_out_norm is present then apply it before adding the input
7030
- // Idea: maybe ffn_out_norm is a better name
7031
- if (model.layers[il].layer_out_norm) {
7032
- cur = llm_build_norm(ctx0, cur, hparams,
7033
- model.layers[il].layer_out_norm, NULL,
7034
- LLM_NORM_RMS, cb, il);
7035
- cb(cur, "layer_out_norm", il);
7372
+ // norm
7373
+ cur = llm_build_norm(ctx0, inpL, hparams,
7374
+ model.layers[il].attn_norm, NULL,
7375
+ LLM_NORM, cb, il);
7376
+ cb(cur, "attn_norm", il);
7377
+
7378
+ // self-attention
7379
+ {
7380
+ struct ggml_tensor * Qcur = nullptr;
7381
+ struct ggml_tensor * Kcur = nullptr;
7382
+ struct ggml_tensor * Vcur = nullptr;
7383
+
7384
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
7385
+ cb(cur, "wqkv", il);
7386
+
7387
+ cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
7388
+ cb(cur, "wqkv_clamped", il);
7389
+
7390
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7391
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7392
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7393
+
7394
+ cb(Qcur, "Qcur", il);
7395
+ cb(Kcur, "Kcur", il);
7396
+ cb(Vcur, "Vcur", il);
7397
+
7398
+ Qcur = ggml_rope_custom(
7399
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7400
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7401
+ ext_factor, attn_factor, beta_fast, beta_slow
7402
+ );
7403
+ cb(Qcur, "Qcur", il);
7404
+
7405
+ Kcur = ggml_rope_custom(
7406
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7407
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7408
+ ext_factor, attn_factor, beta_fast, beta_slow
7409
+ );
7410
+ cb(Kcur, "Kcur", il);
7411
+
7412
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7413
+ model.layers[il].wo, NULL,
7414
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7036
7415
  }
7037
7416
 
7417
+ if (il == n_layer - 1) {
7418
+ // skip computing output for unused tokens
7419
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7420
+ n_tokens = n_outputs;
7421
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7422
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7423
+ }
7424
+
7425
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7426
+ cb(ffn_inp, "ffn_inp", il);
7427
+
7428
+ // feed-forward network
7429
+ // MoE branch
7430
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7431
+ model.layers[il].attn_out_norm, NULL,
7432
+ LLM_NORM, cb, il);
7433
+ cb(cur, "attn_out_norm", il);
7434
+
7435
+ cur = llm_build_moe_ffn(ctx0, cur,
7436
+ model.layers[il].ffn_gate_inp,
7437
+ model.layers[il].ffn_up_exps,
7438
+ model.layers[il].ffn_gate_exps,
7439
+ model.layers[il].ffn_down_exps,
7440
+ n_expert, n_expert_used,
7441
+ LLM_FFN_SILU, true,
7442
+ cb, il);
7443
+ cb(cur, "ffn_moe_out", il);
7038
7444
 
7039
7445
  cur = ggml_add(ctx0, cur, ffn_inp);
7040
7446
  cb(cur, "ffn_out", il);
@@ -7052,18 +7458,13 @@ struct llm_build_context {
7052
7458
  cur = inpL;
7053
7459
 
7054
7460
  cur = llm_build_norm(ctx0, cur, hparams,
7055
- model.output_norm, NULL,
7056
- LLM_NORM_RMS, cb, -1);
7461
+ model.output_norm, NULL,
7462
+ LLM_NORM, cb, -1);
7057
7463
  cb(cur, "result_norm", -1);
7058
7464
 
7059
7465
  // lm_head
7060
7466
  cur = ggml_mul_mat(ctx0, model.output, cur);
7061
7467
 
7062
- // Grok
7063
- // multiply logits by output_multiplier_scale of 0.5773502691896257
7064
-
7065
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
7066
-
7067
7468
  cb(cur, "result_output", -1);
7068
7469
 
7069
7470
  ggml_build_forward_expand(gf, cur);
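
Note: the Grok-specific output scale handled in the hunks above is not an arbitrary constant; 0.5773502691896257 is 1/sqrt(3) written out to double precision. The scaling stays with the Grok graph and is dropped from the graph tail that no longer belongs to it.
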
@@ -7923,7 +8324,7 @@ struct llm_build_context {
7923
8324
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7924
8325
 
7925
8326
  for (int il = 0; il < n_layer; ++il) {
7926
- struct ggml_tensor * inpSA = inpL;
8327
+
7927
8328
 
7928
8329
  // norm
7929
8330
  cur = llm_build_norm(ctx0, inpL, hparams,
@@ -7932,6 +8333,8 @@ struct llm_build_context {
7932
8333
  LLM_NORM, cb, il);
7933
8334
  cb(cur, "attn_norm", il);
7934
8335
 
8336
+ struct ggml_tensor * inpSA = cur;
8337
+
7935
8338
  // self-attention
7936
8339
  {
7937
8340
  // compute Q and K and RoPE them
@@ -7956,15 +8359,36 @@ struct llm_build_context {
7956
8359
  cb(Vcur, "Vcur", il);
7957
8360
  }
7958
8361
 
8362
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8363
+ cb(Qcur, "Qcur", il);
8364
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8365
+ cb(Kcur, "Kcur", il);
8366
+
8367
+ if (model.layers[il].attn_q_norm) {
8368
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8369
+ model.layers[il].attn_q_norm,
8370
+ NULL,
8371
+ LLM_NORM, cb, il);
8372
+ cb(Qcur, "Qcur", il);
8373
+ }
8374
+ if (model.layers[il].attn_k_norm) {
8375
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8376
+ model.layers[il].attn_k_norm,
8377
+ NULL,
8378
+ LLM_NORM, cb, il);
8379
+ cb(Kcur, "Kcur", il);
8380
+ }
8381
+
8382
+
7959
8383
  Qcur = ggml_rope_custom(
7960
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8384
+ ctx0, Qcur, inp_pos,
7961
8385
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7962
8386
  ext_factor, attn_factor, beta_fast, beta_slow
7963
8387
  );
7964
8388
  cb(Qcur, "Qcur", il);
7965
8389
 
7966
8390
  Kcur = ggml_rope_custom(
7967
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8391
+ ctx0, Kcur, inp_pos,
7968
8392
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7969
8393
  ext_factor, attn_factor, beta_fast, beta_slow
7970
8394
  );
@@ -7979,20 +8403,25 @@ struct llm_build_context {
7979
8403
  // skip computing output for unused tokens
7980
8404
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
7981
8405
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8406
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7982
8407
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7983
8408
  }
7984
8409
 
7985
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8410
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7986
8411
  cb(ffn_inp, "ffn_inp", il);
7987
8412
 
7988
8413
  // feed-forward network
7989
8414
  {
7990
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
7991
- model.layers[il].ffn_norm,
7992
- model.layers[il].ffn_norm_b,
7993
- LLM_NORM, cb, il);
7994
- cb(cur, "ffn_norm", il);
7995
-
8415
+ if (model.layers[il].ffn_norm) {
8416
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8417
+ model.layers[il].ffn_norm,
8418
+ model.layers[il].ffn_norm_b,
8419
+ LLM_NORM, cb, il);
8420
+ cb(cur, "ffn_norm", il);
8421
+ } else {
8422
+ // parallel residual
8423
+ cur = inpSA;
8424
+ }
7996
8425
  cur = llm_build_ffn(ctx0, cur,
7997
8426
  model.layers[il].ffn_up, NULL,
7998
8427
  model.layers[il].ffn_gate, NULL,
@@ -8182,12 +8611,6 @@ struct llm_build_context {
8182
8611
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8183
8612
  cb(Vcur, "Vcur", il);
8184
8613
 
8185
- // these nodes are added to the graph together so that they are not reordered
8186
- // by doing so, the number of splits in the graph is reduced
8187
- ggml_build_forward_expand(gf, Qcur);
8188
- ggml_build_forward_expand(gf, Kcur);
8189
- ggml_build_forward_expand(gf, Vcur);
8190
-
8191
8614
  Qcur = ggml_rope_custom(
8192
8615
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8193
8616
  n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -8254,6 +8677,150 @@ struct llm_build_context {
8254
8677
  return gf;
8255
8678
  }
8256
8679
 
8680
+ struct ggml_cgraph * build_qwen2moe() {
8681
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8682
+
8683
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
8684
+ int32_t n_tokens = this->n_tokens;
8685
+
8686
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8687
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8688
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8689
+
8690
+ struct ggml_tensor * cur;
8691
+ struct ggml_tensor * inpL;
8692
+
8693
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
8694
+
8695
+ // inp_pos - contains the positions
8696
+ struct ggml_tensor * inp_pos = build_inp_pos();
8697
+
8698
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8699
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8700
+
8701
+ for (int il = 0; il < n_layer; ++il) {
8702
+ struct ggml_tensor * inpSA = inpL;
8703
+
8704
+ // norm
8705
+ cur = llm_build_norm(ctx0, inpL, hparams,
8706
+ model.layers[il].attn_norm, NULL,
8707
+ LLM_NORM_RMS, cb, il);
8708
+ cb(cur, "attn_norm", il);
8709
+
8710
+ // self_attention
8711
+ {
8712
+ // compute Q and K and RoPE them
8713
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
8714
+ cb(Qcur, "Qcur", il);
8715
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8716
+ cb(Qcur, "Qcur", il);
8717
+
8718
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
8719
+ cb(Kcur, "Kcur", il);
8720
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8721
+ cb(Kcur, "Kcur", il);
8722
+
8723
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
8724
+ cb(Vcur, "Vcur", il);
8725
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8726
+ cb(Vcur, "Vcur", il);
8727
+
8728
+ Qcur = ggml_rope_custom(
8729
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
8730
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8731
+ ext_factor, attn_factor, beta_fast, beta_slow
8732
+ );
8733
+ cb(Qcur, "Qcur", il);
8734
+
8735
+ Kcur = ggml_rope_custom(
8736
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
8737
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
8738
+ ext_factor, attn_factor, beta_fast, beta_slow
8739
+ );
8740
+ cb(Kcur, "Kcur", il);
8741
+
8742
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
8743
+ model.layers[il].wo, model.layers[il].bo,
8744
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8745
+ }
8746
+
8747
+ if (il == n_layer - 1) {
8748
+ // skip computing output for unused tokens
8749
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
8750
+ n_tokens = n_outputs;
8751
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8752
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8753
+ }
8754
+
8755
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8756
+ cb(ffn_inp, "ffn_inp", il);
8757
+
8758
+ // MoE branch
8759
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
8760
+ model.layers[il].ffn_norm, NULL,
8761
+ LLM_NORM_RMS, cb, il);
8762
+ cb(cur, "ffn_norm", il);
8763
+
8764
+ ggml_tensor * moe_out =
8765
+ llm_build_moe_ffn(ctx0, cur,
8766
+ model.layers[il].ffn_gate_inp,
8767
+ model.layers[il].ffn_up_exps,
8768
+ model.layers[il].ffn_gate_exps,
8769
+ model.layers[il].ffn_down_exps,
8770
+ n_expert, n_expert_used,
8771
+ LLM_FFN_SILU, false,
8772
+ cb, il);
8773
+ cb(cur, "ffn_moe_out", il);
8774
+
8775
+ // FFN shared expert
8776
+ {
8777
+ ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
8778
+ cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
8779
+
8780
+ // sigmoid
8781
+ ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
8782
+ cb(cur_gate, "ffn_shexp_gate", il);
8783
+
8784
+ ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
8785
+ model.layers[il].ffn_up_shexp, NULL,
8786
+ model.layers[il].ffn_gate_shexp, NULL,
8787
+ model.layers[il].ffn_down_shexp, NULL,
8788
+ NULL,
8789
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
8790
+ cb(cur_ffn, "ffn_shexp", il);
8791
+
8792
+ ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
8793
+ cb(ffn_shexp_out, "ffn_shexp_out", il);
8794
+
8795
+ moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
8796
+ cb(moe_out, "ffn_out", il);
8797
+
8798
+ cur = moe_out;
8799
+ }
8800
+
8801
+ cur = ggml_add(ctx0, cur, ffn_inp);
8802
+ cb(cur, "l_out", il);
8803
+
8804
+ // input for next layer
8805
+ inpL = cur;
8806
+ }
8807
+
8808
+ cur = inpL;
8809
+
8810
+ cur = llm_build_norm(ctx0, cur, hparams,
8811
+ model.output_norm, NULL,
8812
+ LLM_NORM_RMS, cb, -1);
8813
+ cb(cur, "result_norm", -1);
8814
+
8815
+ // lm_head
8816
+ cur = ggml_mul_mat(ctx0, model.output, cur);
8817
+ cb(cur, "result_output", -1);
8818
+
8819
+ ggml_build_forward_expand(gf, cur);
8820
+
8821
+ return gf;
8822
+ }
8823
+
8257
8824
  struct ggml_cgraph * build_phi2() {
8258
8825
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
8259
8826
 
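
Note: in build_qwen2moe above, the shared-expert gate is squashed with a sigmoid that is built from existing ops: the graph divides ggml_silu of the gate input by the gate input itself. That works because of the identity (assuming x != 0):

    silu(x) = x * sigmoid(x)   =>   sigmoid(x) = silu(x) / x

so no dedicated sigmoid op is needed in the graph.
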
@@ -9588,6 +10155,139 @@ struct llm_build_context {
9588
10155
  return gf;
9589
10156
 
9590
10157
  }
10158
+
10159
+ // ref: https://allenai.org/olmo
10160
+ // based on the original build_llama() function, changes:
10161
+ // * non-parametric layer norm
10162
+ // * clamp qkv
10163
+ // * removed bias
10164
+ // * removed MoE
10165
+ struct ggml_cgraph * build_olmo() {
10166
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
10167
+
10168
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
10169
+ int32_t n_tokens = this->n_tokens;
10170
+
10171
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10172
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10173
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
10174
+
10175
+ struct ggml_tensor * cur;
10176
+ struct ggml_tensor * inpL;
10177
+
10178
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
10179
+
10180
+ // inp_pos - contains the positions
10181
+ struct ggml_tensor * inp_pos = build_inp_pos();
10182
+
10183
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
10184
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
10185
+
10186
+ for (int il = 0; il < n_layer; ++il) {
10187
+ struct ggml_tensor * inpSA = inpL;
10188
+
10189
+ // norm
10190
+ cur = llm_build_norm(ctx0, inpL, hparams,
10191
+ NULL, NULL,
10192
+ LLM_NORM, cb, il);
10193
+ cb(cur, "attn_norm", il);
10194
+
10195
+ // self-attention
10196
+ {
10197
+ // compute Q and K and RoPE them
10198
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
10199
+ cb(Qcur, "Qcur", il);
10200
+ if (hparams.f_clamp_kqv > 0.0f) {
10201
+ Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10202
+ cb(Qcur, "Qcur", il);
10203
+ }
10204
+
10205
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
10206
+ cb(Kcur, "Kcur", il);
10207
+ if (hparams.f_clamp_kqv > 0.0f) {
10208
+ Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10209
+ cb(Kcur, "Kcur", il);
10210
+ }
10211
+
10212
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
10213
+ cb(Vcur, "Vcur", il);
10214
+ if (hparams.f_clamp_kqv > 0.0f) {
10215
+ Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
10216
+ cb(Vcur, "Vcur", il);
10217
+ }
10218
+
10219
+ Qcur = ggml_rope_custom(
10220
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
10221
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10222
+ ext_factor, attn_factor, beta_fast, beta_slow
10223
+ );
10224
+ cb(Qcur, "Qcur", il);
10225
+
10226
+ Kcur = ggml_rope_custom(
10227
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
10228
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
10229
+ ext_factor, attn_factor, beta_fast, beta_slow
10230
+ );
10231
+ cb(Kcur, "Kcur", il);
10232
+
10233
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
10234
+ model.layers[il].wo, nullptr,
10235
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10236
+ }
10237
+
10238
+ if (il == n_layer - 1) {
10239
+ // skip computing output for unused tokens
10240
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
10241
+ n_tokens = n_outputs;
10242
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10243
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10244
+ }
10245
+
10246
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10247
+ cb(ffn_inp, "ffn_inp", il);
10248
+
10249
+ // feed-forward network
10250
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
10251
+ NULL, NULL,
10252
+ LLM_NORM, cb, il);
10253
+ cb(cur, "ffn_norm", il);
10254
+
10255
+ cur = llm_build_ffn(ctx0, cur,
10256
+ model.layers[il].ffn_up, NULL,
10257
+ model.layers[il].ffn_gate, NULL,
10258
+ model.layers[il].ffn_down, NULL,
10259
+ NULL,
10260
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
10261
+ cb(cur, "ffn_out", il);
10262
+
10263
+ cur = ggml_add(ctx0, cur, ffn_inp);
10264
+ cb(cur, "ffn_out", il);
10265
+
10266
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
10267
+ if (layer_dir != nullptr) {
10268
+ cur = ggml_add(ctx0, cur, layer_dir);
10269
+ }
10270
+ cb(cur, "l_out", il);
10271
+
10272
+ // input for next layer
10273
+ inpL = cur;
10274
+ }
10275
+
10276
+ cur = inpL;
10277
+
10278
+ cur = llm_build_norm(ctx0, cur, hparams,
10279
+ NULL, NULL,
10280
+ LLM_NORM, cb, -1);
10281
+ cb(cur, "result_norm", -1);
10282
+
10283
+ // lm_head
10284
+ cur = ggml_mul_mat(ctx0, model.output, cur);
10285
+ cb(cur, "result_output", -1);
10286
+
10287
+ ggml_build_forward_expand(gf, cur);
10288
+
10289
+ return gf;
10290
+ }
9591
10291
  };
9592
10292
 
9593
10293
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
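
Note: build_olmo above passes NULL weight and bias to llm_build_norm, which is the "non-parametric layer norm" called out in its change list. Assuming the usual layer-norm definition, each activation is only standardized,

    y_i = (x_i - mean(x)) / sqrt(var(x) + eps)

with no learned elementwise scale or shift applied afterwards.
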
@@ -9737,6 +10437,10 @@ static struct ggml_cgraph * llama_build_graph(
9737
10437
  {
9738
10438
  result = llm.build_qwen2();
9739
10439
  } break;
10440
+ case LLM_ARCH_QWEN2MOE:
10441
+ {
10442
+ result = llm.build_qwen2moe();
10443
+ } break;
9740
10444
  case LLM_ARCH_PHI2:
9741
10445
  {
9742
10446
  result = llm.build_phi2();
@@ -9785,6 +10489,14 @@ static struct ggml_cgraph * llama_build_graph(
9785
10489
  {
9786
10490
  result = llm.build_command_r();
9787
10491
  } break;
10492
+ case LLM_ARCH_DBRX:
10493
+ {
10494
+ result = llm.build_dbrx();
10495
+ } break;
10496
+ case LLM_ARCH_OLMO:
10497
+ {
10498
+ result = llm.build_olmo();
10499
+ } break;
9788
10500
  default:
9789
10501
  GGML_ASSERT(false);
9790
10502
  }
@@ -12915,6 +13627,11 @@ struct llama_beam_search_data {
12915
13627
  }
12916
13628
  llama_logit_info logit_info(ctx);
12917
13629
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
13630
+
13631
+ // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
13632
+ // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
13633
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
13634
+
12918
13635
  size_t i=0;
12919
13636
  if (next_beams.size() < n_beams) {
12920
13637
  for (; next_beams.size() < n_beams ; ++i) {
@@ -13535,6 +14252,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13535
14252
  gguf_set_kv (ctx_out, ml.meta);
13536
14253
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
13537
14254
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
14255
+ // Remove split metadata
14256
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
14257
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
14258
+ gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
13538
14259
 
13539
14260
  if (params->kv_overrides) {
13540
14261
  const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
@@ -14629,17 +15350,20 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14629
15350
  case LLM_ARCH_MINICPM:
14630
15351
  case LLM_ARCH_XVERSE:
14631
15352
  case LLM_ARCH_COMMAND_R:
15353
+ case LLM_ARCH_OLMO:
14632
15354
  return LLAMA_ROPE_TYPE_NORM;
14633
15355
 
14634
15356
  // the pairs of head values are offset by n_rot/2
14635
15357
  case LLM_ARCH_FALCON:
14636
15358
  case LLM_ARCH_GROK:
15359
+ case LLM_ARCH_DBRX:
14637
15360
  case LLM_ARCH_PERSIMMON:
14638
15361
  case LLM_ARCH_BERT:
14639
15362
  case LLM_ARCH_NOMIC_BERT:
14640
15363
  case LLM_ARCH_STABLELM:
14641
15364
  case LLM_ARCH_QWEN:
14642
15365
  case LLM_ARCH_QWEN2:
15366
+ case LLM_ARCH_QWEN2MOE:
14643
15367
  case LLM_ARCH_PHI2:
14644
15368
  case LLM_ARCH_GEMMA:
14645
15369
  case LLM_ARCH_STARCODER2:
@@ -15320,6 +16044,8 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
15320
16044
  GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
15321
16045
  ctx->output_ids[id] = i;
15322
16046
  }
16047
+
16048
+ ctx->n_outputs = n_outputs;
15323
16049
  }
15324
16050
  }
15325
16051
 
@@ -16472,6 +17198,21 @@ static int32_t llama_chat_apply_template_internal(
16472
17198
  if (add_ass) {
16473
17199
  ss << "### Response:\n";
16474
17200
  }
17201
+ } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
17202
+ // CohereForAI/c4ai-command-r-plus
17203
+ for (auto message : chat) {
17204
+ std::string role(message->role);
17205
+ if (role == "system") {
17206
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17207
+ } else if (role == "user") {
17208
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17209
+ } else if (role == "assistant") {
17210
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
17211
+ }
17212
+ }
17213
+ if (add_ass) {
17214
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
17215
+ }
16475
17216
  } else {
16476
17217
  // template not supported
16477
17218
  return -1;
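
Note: for the new command-r branch above, the rendered prompt can be read straight off the code: each message becomes <|START_OF_TURN_TOKEN|>, a role token, the trimmed content, then <|END_OF_TURN_TOKEN|>, and add_ass appends an open chatbot turn. For example, a single user message "Hello" with add_ass = true would render as:

    <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
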