@fugood/llama.node 1.3.8 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/lib/binding.js +25 -18
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +17 -17
  6. package/scripts/llama.cpp.patch +53 -4
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/LlamaContext.cpp +6 -1
  9. package/src/llama.cpp/common/arg.cpp +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  11. package/src/llama.cpp/common/chat.cpp +0 -952
  12. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  13. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  14. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  22. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  23. package/src/llama.cpp/src/llama-arch.cpp +48 -3
  24. package/src/llama.cpp/src/llama-arch.h +2 -0
  25. package/src/llama.cpp/src/llama-context.cpp +6 -2
  26. package/src/llama.cpp/src/llama-hparams.h +1 -1
  27. package/src/llama.cpp/src/llama-model.cpp +102 -5
  28. package/src/llama.cpp/src/llama-model.h +4 -0
  29. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  30. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  31. package/src/llama.cpp/src/models/models.h +51 -1
  32. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
@@ -2225,6 +2224,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
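Note on the recurrent-layer pattern: the (i + 1) % 4 != 0 rule above marks three linear-attention (gated delta net) layers for every full-attention layer, i.e. layers 3, 7, 11, ... use full attention. A minimal standalone sketch of the same rule, with a made-up layer count purely for illustration (not taken from this diff):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer = 12; // hypothetical layer count, not read from any model
    for (uint32_t i = 0; i < n_layer; ++i) {
        const bool recurrent = ((i + 1) % 4 != 0); // same rule as in the hunk above
        std::printf("layer %2u: %s\n", i, recurrent ? "linear attention" : "full attention");
    }
    return 0;
}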
 
@@ -6133,9 +6155,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
             {
-                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                 if (output == NULL) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6414,6 +6437,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                }
+
+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                // Calculate dimensions from hyperparameters
+                const int64_t head_k_dim = hparams.ssm_d_state;
+                const int64_t head_v_dim = hparams.ssm_d_state;
+                const int64_t n_k_heads = hparams.ssm_n_group;
+                const int64_t n_v_heads = hparams.ssm_dt_rank;
+                const int64_t key_dim = head_k_dim * n_k_heads;
+                const int64_t value_dim = head_v_dim * n_v_heads;
+                const int64_t conv_dim = key_dim * 2 + value_dim;
+
+                // Calculate projection sizes
+                const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                const int64_t ba_dim = n_v_heads * 2;
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                    if (!hparams.is_recurrent(i)) {
+                        // Attention layers
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                        // Q/K normalization for attention layers
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                    } else {
+                        // Linear attention (gated delta net) specific tensors
+                        // Create tensors with calculated dimensions
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                    }
+
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                    // Shared experts
+                    layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                    layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                    layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                    layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
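For orientation, the projection sizes above follow directly from the SSM hyperparameters: key_dim and value_dim are head size times head count, the conv window covers q, k and v, the fused ssm_in projection additionally carries the gate z, and ssm_beta_alpha carries two scalars per V head. A minimal sketch of that arithmetic with hypothetical values (chosen only to illustrate the formulas, not taken from this diff or any GGUF):

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical hyperparameters, stand-ins for the values loaded above.
    const int64_t ssm_d_state = 128; // head size for both K and V
    const int64_t ssm_n_group = 16;  // number of linear-attention K heads
    const int64_t ssm_dt_rank = 32;  // number of linear-attention V heads

    const int64_t key_dim   = ssm_d_state * ssm_n_group;    // 2048
    const int64_t value_dim = ssm_d_state * ssm_dt_rank;    // 4096
    const int64_t conv_dim  = key_dim * 2 + value_dim;      // 8192  -> ssm_conv1d second dim
    const int64_t qkvz_dim  = key_dim * 2 + value_dim * 2;  // 12288 -> ssm_in second dim (q, k, v, z)
    const int64_t ba_dim    = ssm_dt_rank * 2;              // 64    -> ssm_beta_alpha second dim

    std::printf("key_dim=%lld value_dim=%lld conv_dim=%lld qkvz_dim=%lld ba_dim=%lld\n",
                (long long) key_dim, (long long) value_dim, (long long) conv_dim,
                (long long) qkvz_dim, (long long) ba_dim);
    return 0;
}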
@@ -6684,6 +6775,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -7425,7 +7517,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         case LLM_ARCH_PANGU_EMBED:
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            }break;
+            } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7652,6 +7748,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
package/src/llama.cpp/src/llama-model.h
@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1 = nullptr;
     struct ggml_tensor * time_mix_w2 = nullptr;
package/src/llama.cpp/src/llama-quant.cpp
@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        } else if (remapped_name != it.first) {
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
-        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            // now n_attn_layer is the number of attention layers in the encoder
+            // now n_layer_attn is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-            n_attn_layer += 2 * model.hparams.dec_n_layer;
+            n_layer_attn += 2 * model.hparams.dec_n_layer;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
+
+        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
+        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
+
+        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+
+        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
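The reworked assertion encodes a simple bookkeeping rule: only full-attention layers contribute an attn_v.weight tensor, so the expected count is the attention-layer total minus pruned layers minus recurrent (linear attention) layers. A toy check of that arithmetic, with made-up layer counts rather than values from any real model:

#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical hybrid model: 48 layers, 36 of them recurrent (linear attention).
    const int32_t n_layer_attn       = 48; // all layers report non-zero KV heads in this example
    const int32_t n_layer_recr       = 36; // recurrent layers carry no attn_v.weight
    const int32_t pruned_attention_w = 0;  // nothing pruned in this example
    const int32_t n_attention_wv     = 12; // attn_v.weight tensors actually encountered

    assert(n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr);
    return 0;
}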
package/src/llama.cpp/src/models/lfm2.cpp
@@ -9,6 +9,8 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
     ggml_tensor * cur = build_inp_embd(model.tok_embd);
     cb(cur, "model.embed_tokens", -1);
 
+    ggml_build_forward_expand(gf, cur);
+
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_hybrid = build_inp_mem_hybrid();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -40,12 +42,12 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
         cur = ggml_add(ctx0, cur, ffn_out);
     }
 
-    cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
-    cb(cur, "model.embedding_norm", -1);
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
     cur = build_lora_mm(model.output, cur);
-    cb(cur, "lm_head", -1);
+    cb(cur, "result_output", -1);
 
     res->t_logits = cur;
 
package/src/llama.cpp/src/models/models.h
@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
 
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
@@ -421,7 +422,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_layer_attn(
+            llm_graph_input_attn_kv * inp_attn,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            int il);
+
+    ggml_tensor * build_layer_attn_linear(
+            llm_graph_input_rs * inp,
+            ggml_tensor * cur,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
 
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            int il);
+
+    ggml_tensor * build_delta_net_recurrent(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_delta_net_chunking(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_norm_gated(
+            ggml_tensor * input,
+            ggml_tensor * weights,
+            ggml_tensor * gate,
+            int layer);
+
+    const llama_model & model;
+};
 
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);