@fugood/llama.node 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +484 -204
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +156 -15
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/json-partial.cpp +51 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  19. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  21. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  31. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
  40. package/src/llama.cpp/include/llama.h +8 -0
  41. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  42. package/src/llama.cpp/src/llama-arch.h +22 -0
  43. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  44. package/src/llama.cpp/src/llama-context.cpp +6 -0
  45. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  46. package/src/llama.cpp/src/llama-graph.h +10 -1
  47. package/src/llama.cpp/src/llama-hparams.cpp +5 -1
  48. package/src/llama.cpp/src/llama-hparams.h +17 -2
  49. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  50. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  51. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  52. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  53. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  54. package/src/llama.cpp/src/llama-model.cpp +572 -45
  55. package/src/llama.cpp/src/llama-model.h +18 -0
  56. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  57. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  58. package/src/llama.cpp/src/llama-vocab.h +41 -40
  59. package/src/llama.cpp/src/unicode.h +43 -0
@@ -114,6 +114,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
  case LLM_TYPE_A13B: return "A13B";
+ case LLM_TYPE_8B_A1B: return "8B.A1B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
@@ -310,7 +311,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  }

  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
  buft_list_t buft_list;

  // add ACCEL buffer types
@@ -331,11 +332,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  // generally, this will be done using the first device in the list
  // a better approach would be to handle this on a weight-by-weight basis using the offload_op
  // function of the device to determine if it would benefit from being stored in a host buffer
- for (auto * dev : devices) {
- ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
- if (buft) {
- buft_list.emplace_back(dev, buft);
- break;
+ if (!no_host) {
+ for (auto * dev : devices) {
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+ if (buft) {
+ buft_list.emplace_back(dev, buft);
+ break;
+ }
  }
  }
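The new no_host parameter threads the model-level "no host buffers" option down to CPU buffer-type selection: when it is set, the GPU host (pinned) buffer entry is skipped and weights stay in regular CPU buffer types. A minimal sketch of that gating, with dev_t, buft_t and host_buffer_type() as simplified stand-ins for the real ggml handles used in the hunk above:

#include <utility>
#include <vector>

struct dev_t;                        // stand-in for ggml_backend_dev_t
struct buft_t;                       // stand-in for ggml_backend_buffer_type_t
buft_t * host_buffer_type(dev_t *);  // stand-in for ggml_backend_dev_host_buffer_type

using buft_list_t = std::vector<std::pair<dev_t *, buft_t *>>;

void maybe_add_host_buft(buft_list_t & buft_list, const std::vector<dev_t *> & devices, bool no_host) {
    if (no_host) {
        return; // skip pinned host buffers; weights remain in plain CPU memory
    }
    for (auto * dev : devices) {
        if (buft_t * buft = host_buffer_type(dev)) {
            buft_list.emplace_back(dev, buft); // first device offering a host buffer wins
            break;
        }
    }
}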

@@ -512,9 +515,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  llm_arch_is_recurrent(ml.get_arch()));

  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-
  std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

+ std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+ std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+ std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+ std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -675,10 +682,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_MINICPM:
  {
+ // Backward-compatible defaults for older MiniCPM GGUFs
+ hparams.f_embedding_scale = 12.0f;
+ hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
+ hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+
+ // Optional KV reads, override defaults if present in newer GGUF exports
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);

  // MiniCPM uses rope by default, unlike Granite which uses it as a switch
  hparams.rope_finetuned = true;
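As a hedged worked example of the MiniCPM defaults above (the layer and embedding counts here are illustrative, not read from any particular GGUF): with n_layer = 40 and n_embd = 2304, an older file that lacks the scale keys loads with f_embedding_scale = 12.0, f_residual_scale = 1.4 / sqrt(40) ≈ 0.221 and f_logit_scale = 256 / 2304 ≈ 0.111, while a newer export that does carry the embedding/residual/logit scale keys overrides those defaults through the required=false reads.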
@@ -1077,7 +1091,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  break;
  default: type = LLM_TYPE_UNKNOWN;
- }
+ }
+
+ // Load attention parameters
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  } break;
  case LLM_ARCH_GPT2:
  {
@@ -1200,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(6);

  hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;

- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ //applied only if model converted with --sentence-transformers-dense-modules
+ ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+ GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+ GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_0_3B; break;
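For the optional sentence-transformers dense modules read above, the dimension flow implied by the two asserts (an inference from this hunk, not something spelled out elsewhere in the diff) is: pooled embedding of width n_embd → dense_2 projection (dense_2_feat_in = n_embd → dense_2_feat_out) → dense_3 projection (dense_3_feat_in → dense_3_feat_out = n_embd), so the chain only composes when dense_2_feat_out equals dense_3_feat_in. Models converted without --sentence-transformers-dense-modules simply leave these keys unset and the projections are skipped.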
@@ -1978,13 +2005,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }
+ hparams.n_layer_dense_lead = hparams.n_layer;
  switch (hparams.n_ff()) {
  case 4608: type = LLM_TYPE_350M; break;
  case 6912: type = LLM_TYPE_700M; break;
  case 8192: type = LLM_TYPE_1_2B; break;
  case 10752: type = LLM_TYPE_2_6B; break;
- default: type = LLM_TYPE_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_LFM2MOE:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }
+
+ type = LLM_TYPE_8B_A1B;
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
@@ -2009,6 +2051,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
+ ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -2042,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

  // build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
  for (auto * dev : devices) {
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  // add CPU buffer types as a fallback
@@ -3167,6 +3235,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // output rerank head
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -3369,17 +3440,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_PLAMO2:
  {
+ // mamba parameters
  const uint32_t d_conv = hparams.ssm_d_conv;
  const uint32_t d_state = hparams.ssm_d_state;
  const uint32_t num_heads = hparams.ssm_dt_rank;
  const uint32_t intermediate_size = hparams.ssm_d_inner;
- const uint32_t head_dim = intermediate_size / num_heads;
- const uint32_t qk_dim = head_dim;
- const uint32_t v_dim = head_dim;
- const int64_t num_attention_heads = hparams.n_head();
- const int64_t q_num_heads = num_attention_heads;
  const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));

+ // attention parameters
+ const uint32_t qk_dim = hparams.n_embd_head_k;
+ const uint32_t v_dim = hparams.n_embd_head_v;
+
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

  // output
@@ -3413,6 +3484,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  } else {
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t q_num_heads = num_attention_heads;
  const int64_t num_key_value_heads = hparams.n_head_kv(i);
  const int64_t k_num_heads = num_key_value_heads;
  const int64_t v_num_heads = num_key_value_heads;
@@ -3421,8 +3494,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t v_proj_dim = v_num_heads * v_dim;

  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  }

@@ -3622,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // Dense linear weights
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+ dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -4802,11 +4880,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
- layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
  layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
- layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
- layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
  }
  }
  }
@@ -5764,6 +5844,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -5775,11 +5856,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
- // ffn is same for transformer and conv layers
+
+ const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+ // ffn/moe is same for transformer and conv layers
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ if (is_moe_layer) {
+ GGML_ASSERT(n_expert && n_expert_used);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ } else { // dense
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

  // for operator_norm
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
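The tensor split above keys off hparams.n_layer_dense_lead: the first n_layer_dense_lead blocks keep the dense gate/up/down FFN and every later block stores router plus per-expert tensors instead (plain LFM2 sets n_layer_dense_lead = n_layer in the hparams hunk earlier, so it never takes the MoE branch). A small stand-alone sketch that just restates the is_moe_layer test:

#include <cstdint>
#include <vector>

// Which LFM2-MoE layers carry expert tensors (true) vs. a plain dense FFN (false),
// mirroring `is_moe_layer` in the hunk above. Illustrative only.
std::vector<bool> lfm2moe_layer_kinds(uint32_t n_layer, uint32_t n_layer_dense_lead) {
    std::vector<bool> is_moe(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        is_moe[il] = il >= n_layer_dense_lead; // leading blocks stay dense
    }
    return is_moe;
}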
@@ -5837,6 +5930,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+ GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
+ layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // Q and K layernorms for Apertus
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
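A hedged worked example for the chunk ("adjugate") expert shapes created above, using illustrative counts rather than values from an actual GroveMoE GGUF: with n_expert = 64 and n_group_experts = 8, n_chunk_expert = 64 / 8 = 8, so each layer stores 64 regular expert FFNs of width n_ff_exp plus 8 chunk-expert FFNs of width n_ff_chexp; when the GGUF omits those widths they fall back to n_ff / n_expert_used and n_embd_head_k respectively, exactly as the two ternaries above spell out.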
@@ -6171,11 +6353,18 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  }

- if (arch == LLM_ARCH_SMALLTHINKER) {
+ if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  }

+ if (arch == LLM_ARCH_GROVEMOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ }
+
  vocab.print_info();
  }

@@ -7699,6 +7888,8 @@ struct llm_build_bert : public llm_graph_context {
  }

  if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+
  Qcur = build_norm(Qcur,
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
@@ -7708,6 +7899,8 @@ struct llm_build_bert : public llm_graph_context {
  }

  if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
  Kcur = build_norm(Kcur,
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
@@ -8090,6 +8283,9 @@ struct llm_build_mpt : public llm_graph_context {

  // Q/K Layernorm
  if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
  Qcur = build_norm(Qcur,
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
@@ -11674,6 +11870,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
  // TODO: skip computing output earlier for unused tokens

  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ cb(y, "mamba2_y_add_d", il);
  y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

  // grouped RMS norm
@@ -14628,6 +14825,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
  ggml_tensor * inpL;

  inpL = build_inp_embd(model.tok_embd);
+ ggml_build_forward_expand(gf, inpL);

  auto * inp = build_inp_mem_hybrid();

@@ -14659,7 +14857,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {

  // add residual
  cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "block_out", il);
+ cb(cur, "nemotron_h_block_out", il);

  // input for next layer
  inpL = cur;
@@ -16115,10 +16313,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
  }

  ggml_tensor * build_layer_ffn(
- ggml_tensor * cur,
- ggml_tensor * inpSA,
- const llama_model & model,
- const int il) {
+ ggml_tensor * cur,
+ ggml_tensor * inpSA,
+ const llama_model & model,
+ const int il) {

  // For Granite architectures - scale residual
  if (hparams.f_residual_scale) {
@@ -17530,6 +17728,7 @@ private:
  const int64_t n_embd_head_q = hparams.n_embd_head_k;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ int32_t n_head = hparams.n_head(il);
  int32_t n_head_kv = hparams.n_head_kv(il);

  const int64_t q_offset = 0;
@@ -18446,6 +18645,8 @@ struct llm_build_lfm2 : public llm_graph_context {
  ggml_tensor * inp_out_ids = build_inp_out_ids();

  for (int il = 0; il < n_layer; ++il) {
+ const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
  auto * prev_cur = cur;
  cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  cb(cur, "model.layers.{}.operator_norm", il);
@@ -18460,7 +18661,16 @@ struct llm_build_lfm2 : public llm_graph_context {
  }

  cur = ggml_add(ctx0, prev_cur, cur);
- cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
+
+ auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+ ggml_tensor * ffn_out = is_moe_layer ?
+ build_moe_feed_forward(ffn_norm_out, il) :
+ build_dense_feed_forward(ffn_norm_out, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
  }

  cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
@@ -18475,23 +18685,32 @@ struct llm_build_lfm2 : public llm_graph_context {
  ggml_build_forward_expand(gf, cur);
  }

- ggml_tensor * build_feed_forward(ggml_tensor * cur,
- int il) const {
- cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "model.layers.{}.ffn_norm", il);
+ ggml_tensor * build_moe_feed_forward(ggml_tensor * cur,
+ int il) const {
+ return build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
+ }

+ ggml_tensor * build_dense_feed_forward(ggml_tensor * cur,
+ int il) const {
  GGML_ASSERT(!model.layers[il].ffn_up_b);
  GGML_ASSERT(!model.layers[il].ffn_gate_b);
  GGML_ASSERT(!model.layers[il].ffn_down_b);
- cur = build_ffn(cur,
+ return build_ffn(cur,
  model.layers[il].ffn_up, NULL, NULL,
  model.layers[il].ffn_gate, NULL, NULL,
  model.layers[il].ffn_down, NULL, NULL,
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "model.layers.{}.feed_forward.w2", il);
-
- return cur;
  }

  ggml_tensor * build_attn_block(ggml_tensor * cur,
@@ -18861,6 +19080,291 @@ struct llm_build_smallthinker : public llm_graph_context{
  }
  };

+ struct llm_build_grovemoe : public llm_graph_context {
+ llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_logits", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ // TODO: Only do the expert selection and weights once
+ moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_chexps,
+ model.layers[il].ffn_gate_chexps,
+ model.layers[il].ffn_down_chexps,
+ nullptr,
+ n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_adj_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+ cb(cur, "ffn_final_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
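Reading the GroveMoE block above off in equation form (a condensation of the graph shown, not an addition to it): with h = RMSNorm(ffn_inp), router logits p = ffn_gate_inp · h, m = MoE_experts(h; p) and a = MoE_chunk_experts(m; p), each layer computes

  out = ffn_inp + m + expert_group_scale · a

so the chunk experts post-process the main MoE output and are blended in with a fixed scale, with both passes reusing the same router logits p; the chunk pass is capped at min(n_expert_used, n_chunk_expert) active experts, per the ternary above.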
+
+ struct llm_build_apertus : public llm_graph_context {
+ llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+ cb(Vcur, "Vcur_pos", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network with xIELU activation
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // Up projection
+ ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ float alpha_n_val = hparams.xielu_alpha_n[il];
+ float alpha_p_val = hparams.xielu_alpha_p[il];
+ float beta_val = hparams.xielu_beta[il];
+ float eps_val = hparams.xielu_eps[il];
+
+ // Apply xIELU activation
+ ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+ cb(activated, "ffn_xielu", il);
+
+ // Down projection
+ cur = build_lora_mm(model.layers[il].ffn_down, activated);
+ cb(cur, "ffn_down", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, nullptr,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
  llama_memory_i * res;

@@ -19376,6 +19880,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_falcon_h1>(*this, params);
  } break;
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  {
  llm = std::make_unique<llm_build_lfm2>(*this, params);
  } break;
@@ -19387,6 +19892,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ llm = std::make_unique<llm_build_grovemoe>(*this, params);
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ llm = std::make_unique<llm_build_apertus>(*this, params);
+ } break;
  default:
  GGML_ABORT("fatal error");
  }
@@ -19394,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  // add on pooling layer
  llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+ // if the gguf model was converted with --sentence-transformers-dense-modules
+ // there will be two additional dense projection layers
+ // dense linear projections are applied after pooling
+ // TODO: move reranking logic here and generalize
+ llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
  return llm->res->get_gf();
  }

@@ -19418,6 +19937,7 @@ llama_model_params llama_model_default_params() {
  /*.use_mlock =*/ false,
  /*.check_tensors =*/ false,
  /*.use_extra_bufts =*/ true,
+ /*.no_host =*/ false,
  };

  return result;
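A hedged usage sketch for the new field: llama_model_default_params and llama_model_load_from_file are existing entry points in include/llama.h (which also grows by 8 lines in this release), and no_host is the struct member initialized above; treat this as an illustration rather than documented API guidance.

#include "llama.h"

// Load a model while opting out of GPU host (pinned) staging buffers,
// mirroring the /*.no_host =*/ default added above.
static llama_model * load_no_host(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_host = true; // keep weights in regular CPU buffer types
    return llama_model_load_from_file(path, mparams);
}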
@@ -19589,9 +20109,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_OPENAI_MOE:
  case LLM_ARCH_HUNYUAN_DENSE:
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  case LLM_ARCH_SMALLTHINKER:
  case LLM_ARCH_GLM4_MOE:
  case LLM_ARCH_SEED_OSS:
+ case LLM_ARCH_GROVEMOE:
+ case LLM_ARCH_APERTUS:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL:
@@ -19702,6 +20225,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
  return llm_arch_is_recurrent(model->arch);
  }

+ bool llama_model_is_hybrid(const llama_model * model) {
+ return llm_arch_is_hybrid(model->arch);
+ }
+
  bool llama_model_is_diffusion(const llama_model * model) {
  return llm_arch_is_diffusion(model->arch);
  }
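The new predicate sits alongside llama_model_is_recurrent and llama_model_is_diffusion; a short hedged sketch of how a caller might branch on it, assuming llama_model_is_hybrid is exported through include/llama.h in this release:

#include <cstdio>
#include "llama.h"

// Report which kind of memory a loaded model will use.
static void describe_memory(const llama_model * model) {
    if (llama_model_is_hybrid(model)) {
        std::printf("hybrid memory: attention KV cache plus recurrent state\n");
    } else if (llama_model_is_recurrent(model)) {
        std::printf("recurrent state only\n");
    } else {
        std::printf("standard attention KV cache\n");
    }
}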