@fugood/llama.node 1.3.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.js +18 -1
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +7 -7
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/llama.cpp/common/arg.cpp +27 -2
  9. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  10. package/src/llama.cpp/common/chat.cpp +0 -952
  11. package/src/llama.cpp/common/common.cpp +55 -0
  12. package/src/llama.cpp/common/common.h +18 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  14. package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
  15. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +12 -4
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
  28. package/src/llama.cpp/include/llama.h +18 -0
  29. package/src/llama.cpp/src/CMakeLists.txt +2 -0
  30. package/src/llama.cpp/src/llama-arch.cpp +95 -16
  31. package/src/llama.cpp/src/llama-arch.h +15 -0
  32. package/src/llama.cpp/src/llama-context.cpp +7 -3
  33. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  34. package/src/llama.cpp/src/llama-hparams.h +1 -1
  35. package/src/llama.cpp/src/llama-model.cpp +141 -6
  36. package/src/llama.cpp/src/llama-model.h +4 -0
  37. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  38. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  39. package/src/llama.cpp/src/models/models.h +55 -1
  40. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
  41. package/src/llama.cpp/src/models/rnd1.cpp +126 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
@@ -1036,6 +1035,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RND1:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -2213,6 +2224,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
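Note on the recurrent-layer mask above: three out of every four blocks are linear-attention (gated delta net) layers, with a full-attention block every fourth layer. A minimal standalone sketch of that schedule, with the layer count hard-coded to the 80-layer case recognized above (illustrative, not code from this package):

    // Counts attention vs. recurrent layers under the (i + 1) % 4 != 0 rule.
    #include <cstdio>

    int main() {
        const int n_layer = 80; // LLM_TYPE_80B_A3B
        int n_attn = 0, n_recr = 0;
        for (int i = 0; i < n_layer; ++i) {
            if ((i + 1) % 4 != 0) {
                ++n_recr; // linear attention (gated delta net) layer
            } else {
                ++n_attn; // full attention layer
            }
        }
        printf("attn = %d, recurrent = %d\n", n_attn, n_recr); // attn = 20, recurrent = 60
        return 0;
    }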
@@ -3402,6 +3436,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
+        case LLM_ARCH_RND1:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6120,9 +6155,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
             {
-                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                 if (output == NULL) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6401,6 +6437,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                }
+
+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                // Calculate dimensions from hyperparameters
+                const int64_t head_k_dim = hparams.ssm_d_state;
+                const int64_t head_v_dim = hparams.ssm_d_state;
+                const int64_t n_k_heads  = hparams.ssm_n_group;
+                const int64_t n_v_heads  = hparams.ssm_dt_rank;
+                const int64_t key_dim    = head_k_dim * n_k_heads;
+                const int64_t value_dim  = head_v_dim * n_v_heads;
+                const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+                // Calculate projection sizes
+                const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                const int64_t ba_dim   = n_v_heads * 2;
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                    if (!hparams.is_recurrent(i)) {
+                        // Attention layers
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                        // Q/K normalization for attention layers
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                    } else {
+                        // Linear attention (gated delta net) specific tensors
+                        // Create tensors with calculated dimensions
+                        layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
+                        layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                        layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                        layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                        layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                    }
+
+                    layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                    // Shared experts
+                    layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                    layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                    layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                    layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
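For orientation, the projection sizes above are simple products of the SSM hyperparameters. A standalone sketch of the arithmetic; the concrete hyperparameter values are illustrative assumptions, not read from any GGUF:

    // Illustrative dimension math for the gated delta net projections above.
    // All three input values are assumptions chosen for the example.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ssm_d_state = 128; // head_k_dim == head_v_dim (assumed)
        const int64_t ssm_n_group = 16;  // n_k_heads (assumed)
        const int64_t ssm_dt_rank = 32;  // n_v_heads (assumed)

        const int64_t key_dim   = ssm_d_state * ssm_n_group;   // 2048
        const int64_t value_dim = ssm_d_state * ssm_dt_rank;   // 4096
        const int64_t conv_dim  = key_dim * 2 + value_dim;     // 8192: q, k and v pass through the conv
        const int64_t qkvz_dim  = key_dim * 2 + value_dim * 2; // 12288: fused q/k/v/z input projection
        const int64_t ba_dim    = ssm_dt_rank * 2;             // 64: per-head beta and alpha

        printf("conv_dim=%lld qkvz_dim=%lld ba_dim=%lld\n",
               (long long) conv_dim, (long long) qkvz_dim, (long long) ba_dim);
        return 0;
    }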
@@ -6671,6 +6775,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -6720,7 +6825,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
@@ -6882,6 +6987,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
                 res = nullptr;
             } break;
@@ -7075,6 +7181,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada_moe>(*this, params);
             }
             break;
+        case LLM_ARCH_RND1:
+            {
+                llm = std::make_unique<llm_build_rnd1>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7406,7 +7517,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         case LLM_ARCH_PANGU_EMBED:
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            }break;
+            } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7595,6 +7710,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
@@ -7632,6 +7748,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -7667,6 +7784,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+    switch (key) {
+        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE:        return "general.sampling.sequence";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K:           return "general.sampling.top_k";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P:           return "general.sampling.top_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P:           return "general.sampling.min_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD:   return "general.sampling.xtc_threshold";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP:            return "general.sampling.temp";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N:  return "general.sampling.penalty_last_n";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT:  return "general.sampling.penalty_repeat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT:        return "general.sampling.mirostat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU:    return "general.sampling.mirostat_tau";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA:    return "general.sampling.mirostat_eta";
+        default: return nullptr;
+    }
+}
+
 int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
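The new helper maps the llama_model_meta_key enum to GGUF key strings, so callers can read model-embedded sampling defaults without hard-coding "general.sampling.*". A hedged usage sketch, pairing it with the existing llama_model_meta_val_str() from llama.h (`model` is assumed to be a loaded llama_model pointer):

    #include <cstdio>
    #include "llama.h"

    // Prints the model's suggested default temperature, if the GGUF carries one.
    static void print_default_temp(const llama_model * model) {
        char buf[128];
        const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP); // "general.sampling.temp"
        if (key != nullptr && llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) {
            printf("model-suggested temperature: %s\n", buf);
        }
    }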
package/src/llama.cpp/src/llama-model.h

@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1 = nullptr;
     struct ggml_tensor * time_mix_w2 = nullptr;
package/src/llama.cpp/src/llama-quant.cpp

@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        } else if (remapped_name != it.first) {
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
-        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            // now n_attn_layer is the number of attention layers in the encoder
+            // now n_layer_attn is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-            n_attn_layer += 2 * model.hparams.dec_n_layer;
+            n_layer_attn += 2 * model.hparams.dec_n_layer;
        }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
+
+        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
+        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
+
+        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+
+        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
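The relaxed assertion now discounts recurrent layers, which contribute no attn_v.weight tensors to the quantization pass. A worked check under the 80-layer Qwen3 Next schedule sketched earlier, assuming every layer reports a non-zero kv-head count and nothing is pruned (illustrative arithmetic, not logged output):

    // n_layer_attn       = 80 (all layers have n_head_kv > 0; assumed)
    // n_layer_recr       = 60 (3 of every 4 layers are recurrent)
    // pruned_attention_w = 0
    // expected: qs.n_attention_wv == 80 - 0 - 60 == 20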
package/src/llama.cpp/src/models/lfm2.cpp

@@ -9,6 +9,8 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
     ggml_tensor * cur = build_inp_embd(model.tok_embd);
     cb(cur, "model.embed_tokens", -1);
 
+    ggml_build_forward_expand(gf, cur);
+
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_hybrid = build_inp_mem_hybrid();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
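The early ggml_build_forward_expand() call appears to pin the token-embedding node into the graph before the hybrid-memory branches are built (my reading, not stated in the diff). For reference, the signature from ggml.h:

    // Adds `tensor` and its dependency chain to the computation graph `cgraph`.
    void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);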
@@ -40,12 +42,12 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
         cur = ggml_add(ctx0, cur, ffn_out);
     }
 
-    cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
-    cb(cur, "model.embedding_norm", -1);
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
     cur = build_lora_mm(model.output, cur);
-    cb(cur, "lm_head", -1);
+    cb(cur, "result_output", -1);
 
     res->t_logits = cur;
 
package/src/llama.cpp/src/models/models.h

@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
 
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
@@ -421,7 +422,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_layer_attn(
+            llm_graph_input_attn_kv * inp_attn,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            int il);
+
+    ggml_tensor * build_layer_attn_linear(
+            llm_graph_input_rs * inp,
+            ggml_tensor * cur,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
 
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            int il);
+
+    ggml_tensor * build_delta_net_recurrent(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_delta_net_chunking(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_norm_gated(
+            ggml_tensor * input,
+            ggml_tensor * weights,
+            ggml_tensor * gate,
+            int layer);
+
+    const llama_model & model;
+};
 
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);
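The paired build_delta_net_recurrent / build_delta_net_chunking declarations suggest two evaluation strategies for the same state update: a token-by-token recurrence and a chunked form for prompt processing. For reference, the gated delta rule is commonly written as follows (notation from the Gated DeltaNet literature, not taken from this diff; normalization and output gating omitted):

    // S_t = alpha_t * S_{t-1} * (I - beta_t * k_t k_t^T) + beta_t * v_t k_t^T
    // o_t = S_t q_t

Here alpha_t (decay) and beta_t (write strength) would come from the fused ssm_beta_alpha projection loaded earlier (ba_dim = 2 * n_v_heads per token).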
@@ -431,6 +481,10 @@ struct llm_build_refact : public llm_graph_context {
     llm_build_refact(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_rnd1 : public llm_graph_context {
+    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
 };