@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0

package/src/llama.cpp/src/llama-graph.h
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -144,7 +145,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +159,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +178,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +193,7 @@ public:
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +205,7 @@ public:
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +215,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
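
The two new members are views of `s_copy`, computed once per graph and shared by every layer that calls `build_rs`. Below is a minimal sketch of how such views could be carved out of a 1-D I32 tensor with ggml's `ggml_view_1d`; the helper name is illustrative and the real construction lives in llama-graph.cpp.

```cpp
// Illustrative only: split an I32 s_copy tensor of length n_rs into a "main"
// view over the first n_seqs entries and an "extra" view over the rest.
#include "ggml.h"

void split_s_copy(struct ggml_context * ctx,
                  struct ggml_tensor  * s_copy,       // I32 [n_rs]
                  int64_t               n_seqs,
                  struct ggml_tensor ** s_copy_main,  // out: I32 [n_seqs]
                  struct ggml_tensor ** s_copy_extra) // out: I32 [n_rs - n_seqs]
{
    const int64_t n_rs = s_copy->ne[0];

    // views share the parent's data; the offset argument is in bytes
    *s_copy_main  = ggml_view_1d(ctx, s_copy, n_seqs,        0);
    *s_copy_extra = ggml_view_1d(ctx, s_copy, n_rs - n_seqs, n_seqs * ggml_element_size(s_copy));
}
```
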
@@ -247,8 +253,8 @@ public:
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +284,11 @@ public:
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    // need to carry these parameters with them. otherwise, they can point to freed
+    // llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
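
The note added above explains the switch from reference members to by-value copies: a graph input that outlives one batch must not hold references into a `llm_graph_params` that may already be gone. A stand-alone sketch of the same pitfall, using illustrative types that are not part of the library:

```cpp
#include <cstdio>

struct params { int n_ctx; };

struct input_by_ref  { const params & p; };  // holds a reference into caller-owned storage
struct input_by_copy { const params   p; };  // carries its own copy of the parameters

input_by_ref  make_ref () { params local{4096}; return {local}; } // reference dangles on return
input_by_copy make_copy() { params local{4096}; return {local}; } // copy stays valid

int main() {
    input_by_copy ok = make_copy();
    std::printf("n_ctx = %d\n", ok.p.n_ctx);  // fine: the input owns its params

    // input_by_ref bad = make_ref();
    // std::printf("%d\n", bad.p.n_ctx);      // stack-use-after-return, the bug ASan reports
    return 0;
}
```
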
@@ -318,8 +327,8 @@ public:
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
@@ -415,7 +424,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
@@ -609,6 +620,7 @@ struct llm_graph_context {
             llm_ffn_gate_type type_gate,
             int il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
@@ -623,19 +635,29 @@
             bool scale_w,
             float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
-    ggml_tensor * build_moe_ffn_from_probs(
+    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
-            ggml_tensor * probs,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
             ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
             ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
             ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
             ggml_tensor * exp_probs_b,
             int64_t n_expert,
             int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
@@ -663,6 +685,7 @@ struct llm_graph_context {
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * sinks,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;
 
@@ -709,6 +732,20 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * sinks, // [n_head_q]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
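
`build_attn_with_sinks` threads a per-head `sinks` tensor (shape [n_head_q]) into the attention computation. A scalar sketch of one common formulation, in which the sink acts as an extra logit that enlarges the softmax normalizer without attending to any value; this illustrates the idea only and is not the graph code:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax over attention scores with a per-head "sink" logit that adds
// probability mass to "nothing": the returned weights sum to less than 1.
std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink) {
    float m = sink;
    for (float s : scores) m = std::max(m, s);       // subtract the max for numerical stability

    float denom = std::exp(sink - m);                // the sink term only affects the normalizer
    for (float s : scores) denom += std::exp(s - m);

    std::vector<float> w(scores.size());
    for (size_t i = 0; i < scores.size(); ++i) {
        w[i] = std::exp(scores[i] - m) / denom;
    }
    return w;
}

int main() {
    for (float x : softmax_with_sink({1.0f, 2.0f, 0.5f}, /*sink =*/ 0.0f)) {
        std::printf("%.3f ", x);
    }
    std::printf("\n");
    return 0;
}
```
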
@@ -727,7 +764,6 @@
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -735,12 +771,13 @@
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor * state_copy,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
             int32_t state_size,
             int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
             int32_t rs_zero,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 

package/src/llama.cpp/src/llama-hparams.h
@@ -9,9 +9,10 @@
 #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
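
The comment on the new `LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT` value says the softmax is applied to the router weights rather than to the logits. One plausible reading, sketched on scalars below, is that the already-selected expert weights are normalized among themselves instead of normalizing all expert logits up front; the selection step and names are illustrative, not the library's implementation:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(std::vector<float> v) {
    const float m = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float & x : v) { x = std::exp(x - m); sum += x; }
    for (float & x : v) { x /= sum; }
    return v;
}

int main() {
    const std::vector<float> logits = {2.0f, 0.5f, 1.5f, -1.0f};
    const size_t k = 2; // n_expert_used

    // pick the indices of the k largest logits (same selection in both variants)
    std::vector<size_t> idx = {0, 1, 2, 3};
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](size_t a, size_t b) { return logits[a] > logits[b]; });

    // SOFTMAX: normalize over all logits, then keep the top-k weights
    const std::vector<float> all = softmax(logits);
    std::printf("softmax over logits: %.3f %.3f\n", all[idx[0]], all[idx[1]]);

    // SOFTMAX_WEIGHT (as read here): normalize only the selected weights,
    // so the k used weights always sum to 1
    const std::vector<float> sel = softmax({logits[idx[0]], logits[idx[1]]});
    std::printf("softmax over top-k : %.3f %.3f\n", sel[0], sel[1]);
    return 0;
}
```
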
@@ -73,6 +74,7 @@ struct llama_hparams {
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;

package/src/llama.cpp/src/llama-kv-cache-unified.cpp
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
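
In effect, the unified KV cache for GLM-4.5-style models is sized only for the regular transformer layers: if, hypothetically, `hparams.n_layer` were 47 with `nextn_predict_layers = 1`, the cache would cover 46 layers and the trailing NextN layer would get no cache entries.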
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
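
This line now falls back to the member's in-class default (flipped to `true` in the header change below) and lets the `LLAMA_SET_ROWS` environment variable override it only when the variable is actually set: a value that parses to 0 disables the path, anything non-zero enables it. A small stand-alone sketch of the pattern:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // the default comes from the declaration site, as in the header change below
    bool supports_set_rows = true;

    // the env var, when present, overrides the default
    const char * LLAMA_SET_ROWS = std::getenv("LLAMA_SET_ROWS");
    supports_set_rows = LLAMA_SET_ROWS ? std::atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;

    std::printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
    return 0;
}
```
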

package/src/llama.cpp/src/llama-kv-cache-unified.h
@@ -230,7 +230,7 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
 
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 

package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-        1,
+        unified,
         kv_size,
         n_seq_max,
         n_pad,

package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -39,6 +39,7 @@ public:
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn = nullptr,
         layer_filter_cb && filter_recr = nullptr);

package/src/llama.cpp/src/llama-model-loader.cpp
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     case LLAMA_FTYPE_MOSTLY_Q5_0:      return "Q5_0";
     case LLAMA_FTYPE_MOSTLY_Q5_1:      return "Q5_1";
     case LLAMA_FTYPE_MOSTLY_Q8_0:      return "Q8_0";
+    case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
     case LLAMA_FTYPE_MOSTLY_Q2_K:      return "Q2_K - Medium";
     case LLAMA_FTYPE_MOSTLY_Q2_K_S:    return "Q2_K - Small";
     case LLAMA_FTYPE_MOSTLY_Q3_K_S:    return "Q3_K - Small";

package/src/llama.cpp/src/llama-model-loader.h
@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;
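
With the `TENSOR_*` constants now distinct powers of two, they act as combinable bit flags: the new `TENSOR_SKIP` can be OR-ed together with the existing flags and each flag tested independently with a bitwise AND. A minimal sketch of that usage outside the loader:

```cpp
#include <cstdio>

// mirrors the constants from llama_model_loader
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED   = 1 << 1;
static const int TENSOR_SKIP         = 1 << 2;

int main() {
    // flags can now be combined ...
    const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;

    // ... and queried independently
    std::printf("not required: %d\n", (flags & TENSOR_NOT_REQUIRED) != 0); // 1
    std::printf("duplicated  : %d\n", (flags & TENSOR_DUPLICATED)   != 0); // 0
    std::printf("skip        : %d\n", (flags & TENSOR_SKIP)         != 0); // 1
    return 0;
}
```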