@fugood/llama.node 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +423 -186
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +23 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/sampling.cpp +1 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  19. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  31. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  42. package/src/llama.cpp/include/llama.h +23 -11
  43. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  44. package/src/llama.cpp/src/llama-arch.h +22 -0
  45. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  46. package/src/llama.cpp/src/llama-context.cpp +157 -0
  47. package/src/llama.cpp/src/llama-context.h +10 -0
  48. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  49. package/src/llama.cpp/src/llama-graph.h +10 -1
  50. package/src/llama.cpp/src/llama-hparams.h +17 -2
  51. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
  52. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  53. package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
  54. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  55. package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  56. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  57. package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
  58. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  59. package/src/llama.cpp/src/llama-memory.h +3 -0
  60. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  61. package/src/llama.cpp/src/llama-model.cpp +582 -45
  62. package/src/llama.cpp/src/llama-model.h +23 -1
  63. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  64. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  65. package/src/llama.cpp/src/llama-vocab.h +41 -40
  66. package/src/llama.cpp/src/unicode.h +43 -0
@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_1_7B: return "1.7B";
  case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
+ case LLM_TYPE_2_6B: return "2.6B";
  case LLM_TYPE_2_8B: return "2.8B";
  case LLM_TYPE_2_9B: return "2.9B";
  case LLM_TYPE_3B: return "3B";
@@ -113,6 +114,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
  case LLM_TYPE_A13B: return "A13B";
+ case LLM_TYPE_8B_A1B: return "8B.A1B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
@@ -309,7 +311,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  }

  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
  buft_list_t buft_list;

  // add ACCEL buffer types
@@ -330,11 +332,13 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  // generally, this will be done using the first device in the list
  // a better approach would be to handle this on a weight-by-weight basis using the offload_op
  // function of the device to determine if it would benefit from being stored in a host buffer
- for (auto * dev : devices) {
- ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
- if (buft) {
- buft_list.emplace_back(dev, buft);
- break;
+ if (!no_host) {
+ for (auto * dev : devices) {
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+ if (buft) {
+ buft_list.emplace_back(dev, buft);
+ break;
+ }
  }
  }

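The extra `no_host` argument above is driven by the new `llama_model_params.no_host` field (see the `llama_model_default_params()` hunk near the end of this diff); when set, the loader skips GPU-host (pinned) buffer types for CPU-resident weights. A minimal usage sketch, assuming the field is exposed through the public header as the default-params hunk suggests:

```cpp
// Sketch only: exercises the no_host flag added in this diff; the model path is illustrative.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_host = true; // keep weights out of pinned (GPU host) buffers
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}
```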
@@ -511,9 +515,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  llm_arch_is_recurrent(ml.get_arch()));

  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-
  std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);

+ std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+ std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+ std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+ std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -674,10 +682,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_MINICPM:
  {
+ // Backward-compatible defaults for older MiniCPM GGUFs
+ hparams.f_embedding_scale = 12.0f;
+ hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
+ hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+
+ // Optional KV reads, override defaults if present in newer GGUF exports
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);

  // MiniCPM uses rope by default, unlike Granite which uses it as a switch
  hparams.rope_finetuned = true;
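The MiniCPM change above makes the three scale keys optional: hard-coded defaults matching older conversions are set first, and a newer GGUF can still override them. The pattern in isolation, as a standalone sketch (not the llama.cpp loader API):

```cpp
// Illustrative helper, assuming a reader that returns std::nullopt for a missing GGUF key.
#include <optional>

static float resolve_scale(std::optional<float> gguf_value, float legacy_default) {
    // a missing key leaves the backward-compatible default in place
    return gguf_value.value_or(legacy_default);
}
```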
@@ -1076,7 +1091,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  break;
  default: type = LLM_TYPE_UNKNOWN;
- }
+ }
+
+ // Load attention parameters
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  } break;
  case LLM_ARCH_GPT2:
  {
@@ -1199,12 +1218,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(6);

  hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
+ hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;

- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ //applied only if model converted with --sentence-transformers-dense-modules
+ ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+ ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+ GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+ GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_0_3B; break;
@@ -1977,12 +2005,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  for (uint32_t il = 0; il < hparams.n_layer; ++il) {
  hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }
- switch (hparams.n_embd) {
- case 1024: type = LLM_TYPE_350M; break;
- case 1536: type = LLM_TYPE_700M; break;
- case 2048: type = LLM_TYPE_1_2B; break;
- default: type = LLM_TYPE_UNKNOWN;
+ hparams.n_layer_dense_lead = hparams.n_layer;
+ switch (hparams.n_ff()) {
+ case 4608: type = LLM_TYPE_350M; break;
+ case 6912: type = LLM_TYPE_700M; break;
+ case 8192: type = LLM_TYPE_1_2B; break;
+ case 10752: type = LLM_TYPE_2_6B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_LFM2MOE:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }
+
+ type = LLM_TYPE_8B_A1B;
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
@@ -2007,6 +2051,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
+ ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

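Apertus stores its xIELU parameters per layer, which is why the block above uses `get_key_or_arr` and why the arrays are zero-filled earlier in `load_hparams`. A rough sketch of the scalar-or-array behaviour this relies on (hypothetical helper, not the actual `llama_model_loader` implementation):

```cpp
#include <stdexcept>
#include <vector>

// Assumed semantics: a single scalar in the GGUF is broadcast to every layer,
// while an array must supply exactly one value per layer.
static std::vector<float> expand_per_layer(const std::vector<float> & kv_values, size_t n_layer) {
    if (kv_values.size() == 1)       return std::vector<float>(n_layer, kv_values[0]);
    if (kv_values.size() == n_layer) return kv_values;
    throw std::runtime_error("expected 1 or n_layer values for per-layer key");
}
```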
@@ -2040,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");

  // build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
  for (auto * dev : devices) {
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  // add CPU buffer types as a fallback
@@ -3165,6 +3235,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // output rerank head
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -3367,17 +3440,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_PLAMO2:
  {
+ // mamba parameters
  const uint32_t d_conv = hparams.ssm_d_conv;
  const uint32_t d_state = hparams.ssm_d_state;
  const uint32_t num_heads = hparams.ssm_dt_rank;
  const uint32_t intermediate_size = hparams.ssm_d_inner;
- const uint32_t head_dim = intermediate_size / num_heads;
- const uint32_t qk_dim = head_dim;
- const uint32_t v_dim = head_dim;
- const int64_t num_attention_heads = hparams.n_head();
- const int64_t q_num_heads = num_attention_heads;
  const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));

+ // attention parameters
+ const uint32_t qk_dim = hparams.n_embd_head_k;
+ const uint32_t v_dim = hparams.n_embd_head_v;
+
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

  // output
@@ -3411,6 +3484,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
  layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
  } else {
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t q_num_heads = num_attention_heads;
  const int64_t num_key_value_heads = hparams.n_head_kv(i);
  const int64_t k_num_heads = num_key_value_heads;
  const int64_t v_num_heads = num_key_value_heads;
@@ -3419,8 +3494,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  const int64_t v_proj_dim = v_num_heads * v_dim;

  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
  }

@@ -3620,6 +3695,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // Dense linear weights
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+ dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -4800,11 +4880,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
  if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
  layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
- layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
  layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
  layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
- layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
- layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
+
+ // Optional tensors
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
  }
  }
  }
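Marking the three NextN/MTP tensors above with `TENSOR_NOT_REQUIRED` means `create_tensor` returns `nullptr` when a GGUF omits them instead of aborting the load. Any consumer therefore has to treat them as optional, roughly like this (hypothetical guard; per the comment in the hunk these tensors are currently preserved but unused):

```cpp
// Sketch of the null-check contract implied by TENSOR_NOT_REQUIRED.
if (layer.nextn.shared_head_head == nullptr) {
    // export without the duplicated MTP head: nothing to do, the tensor is unused anyway
}
```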
@@ -5762,6 +5844,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -5773,11 +5856,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
- // ffn is same for transformer and conv layers
+
+ const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+ // ffn/moe is same for transformer and conv layers
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ if (is_moe_layer) {
+ GGML_ASSERT(n_expert && n_expert_used);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+ } else { // dense
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

  // for operator_norm
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
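For LFM2-family models the dense/MoE split is driven entirely by `n_layer_dense_lead`: plain LFM2 sets it to `n_layer` in `load_hparams` (so every layer stays dense), while LFM2MoE reads it from `LLM_KV_LEADING_DENSE_BLOCK_COUNT`. A standalone restatement of the predicate used above:

```cpp
#include <cstdint>

// Layers [0, n_layer_dense_lead) are dense; the remaining layers carry expert tensors.
static bool is_moe_layer(int32_t il, uint32_t n_layer_dense_lead) {
    return il >= (int32_t) n_layer_dense_lead;
}
```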
@@ -5835,6 +5930,95 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+ GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0);
+ layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ } else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+ // Q and K layernorms for Apertus
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -6003,6 +6187,14 @@ size_t llama_model::n_devices() const {
  return devices.size();
  }

+ std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
+ for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+ ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ }
+ return ret;
+ }
+
  uint64_t llama_model::n_elements() const {
  return pimpl->n_elements;
  }
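The new `memory_breakdown()` accessor above aggregates the model's weight buffers by buffer type. A hypothetical internal consumer could log it per backend like this (assuming a `llama_model` reference named `model`):

```cpp
// Sketch: print the per-buffer-type weight footprint reported by memory_breakdown().
for (const auto & [buft, bytes] : model.memory_breakdown()) {
    LLAMA_LOG_INFO("%s: %-24s %8.2f MiB\n", __func__,
                   ggml_backend_buft_name(buft), bytes / 1024.0 / 1024.0);
}
```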
@@ -6161,11 +6353,18 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  }

- if (arch == LLM_ARCH_SMALLTHINKER) {
+ if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
  }

+ if (arch == LLM_ARCH_GROVEMOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
+ }
+
  vocab.print_info();
  }

@@ -7689,6 +7888,8 @@ struct llm_build_bert : public llm_graph_context {
  }

  if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+
  Qcur = build_norm(Qcur,
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
@@ -7698,6 +7899,8 @@ struct llm_build_bert : public llm_graph_context {
  }

  if (model.layers[il].attn_k_norm) {
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
  Kcur = build_norm(Kcur,
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
@@ -8080,6 +8283,9 @@ struct llm_build_mpt : public llm_graph_context {

  // Q/K Layernorm
  if (model.layers[il].attn_q_norm) {
+ Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head*n_head, n_tokens);
+ Kcur = ggml_reshape_2d(ctx0, Kcur, n_embd_head*n_head_kv, n_tokens);
+
  Qcur = build_norm(Qcur,
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
@@ -11664,6 +11870,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
  // TODO: skip computing output earlier for unused tokens

  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ cb(y, "mamba2_y_add_d", il);
  y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

  // grouped RMS norm
@@ -14618,6 +14825,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
  ggml_tensor * inpL;

  inpL = build_inp_embd(model.tok_embd);
+ ggml_build_forward_expand(gf, inpL);

  auto * inp = build_inp_mem_hybrid();

@@ -14649,7 +14857,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {

  // add residual
  cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "block_out", il);
+ cb(cur, "nemotron_h_block_out", il);

  // input for next layer
  inpL = cur;
@@ -17520,6 +17728,7 @@ private:
  const int64_t n_embd_head_q = hparams.n_embd_head_k;
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ int32_t n_head = hparams.n_head(il);
  int32_t n_head_kv = hparams.n_head_kv(il);

  const int64_t q_offset = 0;
@@ -18436,6 +18645,8 @@ struct llm_build_lfm2 : public llm_graph_context {
  ggml_tensor * inp_out_ids = build_inp_out_ids();

  for (int il = 0; il < n_layer; ++il) {
+ const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);
+
  auto * prev_cur = cur;
  cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
  cb(cur, "model.layers.{}.operator_norm", il);
@@ -18450,7 +18661,16 @@ struct llm_build_lfm2 : public llm_graph_context {
  }

  cur = ggml_add(ctx0, prev_cur, cur);
- cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
+
+ auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);
+
+ ggml_tensor * ffn_out = is_moe_layer ?
+ build_moe_feed_forward(ffn_norm_out, il) :
+ build_dense_feed_forward(ffn_norm_out, il);
+ cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_out);
  }

  cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
@@ -18465,23 +18685,32 @@ struct llm_build_lfm2 : public llm_graph_context {
  ggml_build_forward_expand(gf, cur);
  }

- ggml_tensor * build_feed_forward(ggml_tensor * cur,
- int il) const {
- cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
- cb(cur, "model.layers.{}.ffn_norm", il);
+ ggml_tensor * build_moe_feed_forward(ggml_tensor * cur,
+ int il) const {
+ return build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+ il);
+ }

+ ggml_tensor * build_dense_feed_forward(ggml_tensor * cur,
+ int il) const {
  GGML_ASSERT(!model.layers[il].ffn_up_b);
  GGML_ASSERT(!model.layers[il].ffn_gate_b);
  GGML_ASSERT(!model.layers[il].ffn_down_b);
- cur = build_ffn(cur,
+ return build_ffn(cur,
  model.layers[il].ffn_up, NULL, NULL,
  model.layers[il].ffn_gate, NULL, NULL,
  model.layers[il].ffn_down, NULL, NULL,
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, il);
- cb(cur, "model.layers.{}.feed_forward.w2", il);
-
- return cur;
  }

  ggml_tensor * build_attn_block(ggml_tensor * cur,
@@ -18851,6 +19080,291 @@ struct llm_build_smallthinker : public llm_graph_context{
  }
  };

+ struct llm_build_grovemoe : public llm_graph_context {
+ llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+ cb(probs, "ffn_moe_logits", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_moe_out", il);
+ cur = moe_out;
+
+ // TODO: Only do the expert selection and weights once
+ moe_out =
+ build_moe_ffn(cur,
+ nullptr,
+ model.layers[il].ffn_up_chexps,
+ model.layers[il].ffn_gate_chexps,
+ model.layers[il].ffn_down_chexps,
+ nullptr,
+ n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il, probs);
+ cb(moe_out, "ffn_adj_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+ cb(cur, "ffn_final_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_apertus : public llm_graph_context {
+ llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur_pos", il);
+ cb(Kcur, "Kcur_pos", il);
+ cb(Vcur, "Vcur_pos", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network with xIELU activation
+ {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, nullptr,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // Up projection
+ ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+ cb(up, "ffn_up", il);
+
+ float alpha_n_val = hparams.xielu_alpha_n[il];
+ float alpha_p_val = hparams.xielu_alpha_p[il];
+ float beta_val = hparams.xielu_beta[il];
+ float eps_val = hparams.xielu_eps[il];
+
+ // Apply xIELU activation
+ ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+ cb(activated, "ffn_xielu", il);
+
+ // Down projection
+ cur = build_lora_mm(model.layers[il].ffn_down, activated);
+ cb(cur, "ffn_down", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, nullptr,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
  llama_memory_i * res;

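In the GroveMoE builder above, the regular experts and the smaller per-group "chunk" experts share a single router (`probs`); the chunk output (`ffn_adj_moe_out`) is folded back with a fixed `expert_group_scale` before the residual add. A didactic scalar reading of that combination (the real code operates on ggml tensors):

```cpp
// Illustrative only: per-element view of the GroveMoE output combination.
static float grove_combine(float moe_out, float chunk_moe_out, float expert_group_scale) {
    // cur = moe_out + expert_group_scale * chunk_moe_out, followed by the residual add to ffn_inp
    return moe_out + expert_group_scale * chunk_moe_out;
}
```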
@@ -19366,6 +19880,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_falcon_h1>(*this, params);
  } break;
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  {
  llm = std::make_unique<llm_build_lfm2>(*this, params);
  } break;
@@ -19377,6 +19892,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
  }
  } break;
+ case LLM_ARCH_GROVEMOE:
+ {
+ llm = std::make_unique<llm_build_grovemoe>(*this, params);
+ } break;
+ case LLM_ARCH_APERTUS:
+ {
+ llm = std::make_unique<llm_build_apertus>(*this, params);
+ } break;
  default:
  GGML_ABORT("fatal error");
  }
@@ -19384,6 +19907,12 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  // add on pooling layer
  llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+ // if the gguf model was converted with --sentence-transformers-dense-modules
+ // there will be two additional dense projection layers
+ // dense linear projections are applied after pooling
+ // TODO: move reranking logic here and generalize
+ llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+
  return llm->res->get_gf();
  }

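`build_dense_out` (declared on the graph context; `llama-graph.cpp`/`llama-graph.h` also change in this release) applies the two optional Sentence-Transformers dense modules after pooling. A rough shape sketch of what that amounts to, assuming both tensors were present in the GGUF and that `dense_3_feat_in == dense_2_feat_out`:

```cpp
// Illustrative ggml calls; `cur` is the pooled embedding with shape [n_embd, n_rows].
cur = ggml_mul_mat(ctx0, dense_2_out_layers, cur); // -> [dense_2_feat_out, n_rows]
cur = ggml_mul_mat(ctx0, dense_3_out_layers, cur); // -> [n_embd, n_rows]
```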
@@ -19408,6 +19937,7 @@ llama_model_params llama_model_default_params() {
  /*.use_mlock =*/ false,
  /*.check_tensors =*/ false,
  /*.use_extra_bufts =*/ true,
+ /*.no_host =*/ false,
  };

  return result;
@@ -19579,9 +20109,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_OPENAI_MOE:
  case LLM_ARCH_HUNYUAN_DENSE:
  case LLM_ARCH_LFM2:
+ case LLM_ARCH_LFM2MOE:
  case LLM_ARCH_SMALLTHINKER:
  case LLM_ARCH_GLM4_MOE:
  case LLM_ARCH_SEED_OSS:
+ case LLM_ARCH_GROVEMOE:
+ case LLM_ARCH_APERTUS:
  return LLAMA_ROPE_TYPE_NEOX;

  case LLM_ARCH_QWEN2VL:
@@ -19692,6 +20225,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
  return llm_arch_is_recurrent(model->arch);
  }

+ bool llama_model_is_hybrid(const llama_model * model) {
+ return llm_arch_is_hybrid(model->arch);
+ }
+
  bool llama_model_is_diffusion(const llama_model * model) {
  return llm_arch_is_diffusion(model->arch);
  }
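`llama_model_is_hybrid` is a new public predicate alongside the existing `llama_model_is_recurrent` (the matching declaration presumably lands in `include/llama.h`, which also changes in this diff). A usage sketch:

```cpp
// Sketch: hybrid architectures mix attention layers with recurrent/SSM layers,
// so they typically need both a KV cache and a recurrent-state cache.
if (llama_model_is_hybrid(model)) {
    // plan context memory for both cache types
}
```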