@fugood/llama.node 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
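
For context: the NextN/MTP (multi-token prediction) tensors ship at the end of the GGUF layer stack but are not executed during normal decoding, so the cache only needs cells for the layers that actually run. A minimal sketch of the resulting layer-count arithmetic — the concrete numbers below are hypothetical, not taken from the diff:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical GLM-4.5-style hyperparameters, for illustration only.
    const uint32_t n_layer              = 47; // total transformer layers in the GGUF
    const uint32_t nextn_predict_layers = 1;  // trailing NextN/MTP layers, skipped by the cache

    const uint32_t n_layer_cache = n_layer - nextn_predict_layers;
    std::printf("KV cache covers %u of %u layers\n", n_layer_cache, n_layer);
    return 0;
}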
@@ -219,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head  = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
         p0 = 0;
@@ -235,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head  = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -246,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
                 }
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head  = v_heads[s];
 
-            cells.rm(i);
+            uint32_t new_head = cells.size();
 
-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
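
The net effect of the seq_rm changes above: seq_id == -1 is now accepted as a wildcard, and the cell/head bookkeeping moved inside each branch so the wildcard case can walk every stream. A self-contained toy of that per-stream removal pattern — the Cell/Stream types here are illustrative stand-ins, not llama.cpp types:

#include <cstdint>
#include <vector>

struct Cell { int32_t pos = -1; };  // pos < 0 marks an empty cell
using Stream = std::vector<Cell>;

// Remove cells whose position falls in [p0, p1) and pull `head` back to the
// first freed slot so the next slot search can start there.
static void rm_range(Stream & cells, uint32_t & head, int32_t p0, int32_t p1) {
    uint32_t new_head = cells.size();
    for (uint32_t i = 0; i < cells.size(); ++i) {
        if (cells[i].pos < p0 || cells[i].pos >= p1) {
            continue;
        }
        cells[i].pos = -1;
        if (new_head == cells.size()) {
            new_head = i;
        }
    }
    if (new_head != cells.size() && new_head < head) {
        head = new_head;
    }
}

// seq_id == -1 matches any sequence, i.e. the removal is applied to every stream.
void seq_rm(std::vector<Stream> & v_cells, std::vector<uint32_t> & v_heads,
            uint32_t stream_of_seq, int32_t seq_id, int32_t p0, int32_t p1) {
    if (seq_id >= 0) {
        rm_range(v_cells[stream_of_seq], v_heads[stream_of_seq], p0, p1);
    } else {
        for (uint32_t s = 0; s < v_cells.size(); ++s) {
            rm_range(v_cells[s], v_heads[s], p0, p1);
        }
    }
}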
@@ -734,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }
 
 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
 
-        const uint32_t head_cur = v_heads[1];
-
-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
 
-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                     } else {
-                        ss += 'M';
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                     }
                 }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
             }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                 }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
             }
-
-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
         }
     }
 
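To actually see these per-stream dumps, the gating `debug` level is, as far as I can tell, read from the LLAMA_KV_CACHE_DEBUG environment variable when the cache is constructed; treat the sketch below as an assumption rather than documented API. The level-to-output mapping mirrors the conditions visible in the diff:

#include <cstdlib>

// Assumed mapping (derived from the conditions above):
//   0  -> no output
//   1  -> one-line summary per stream
//   2  -> also dump the cell grids, but only when SWA is active (n_swa > 0)
//   3+ -> always dump the cell grids
static int read_kv_cache_debug_level() {
    const char * v = std::getenv("LLAMA_KV_CACHE_DEBUG");
    return v ? std::atoi(v) : 0;
}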
@@ -1808,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1859,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     uint32_t n_stream_cur;
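
As the context lines show, state_write prefixes the payload with n_stream and state_read recovers it (the `uint32_t n_stream_cur;` line) before validating against its own configuration. A toy round-trip of that framing, using a simplified in-memory buffer in place of llama_io_write_i/llama_io_read_i:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Minimal stand-in for the io interfaces: append-only write, cursor-based read.
struct io_buf {
    std::vector<uint8_t> data;
    size_t rpos = 0;

    void write(const void * p, size_t n) {
        const uint8_t * b = static_cast<const uint8_t *>(p);
        data.insert(data.end(), b, b + n);
    }
    void read_to(void * p, size_t n) {
        std::memcpy(p, data.data() + rpos, n);
        rpos += n;
    }
};

int main() {
    io_buf io;

    const uint32_t n_stream = 4;
    io.write(&n_stream, sizeof(n_stream));           // writer: prefix with stream count
    // ... per-stream cell and tensor data would follow here ...

    uint32_t n_stream_cur = 0;
    io.read_to(&n_stream_cur, sizeof(n_stream_cur)); // reader: recover and validate
    std::printf("restored state has %u streams\n", n_stream_cur);
    return 0;
}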
@@ -136,8 +136,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified specific API
@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }
 
-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
@@ -74,8 +74,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_memory_hybrid specific API
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }
 
-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }
 
-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));
 
@@ -63,8 +63,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //
 
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
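
Because the new flags parameter is trailing and defaulted across the whole hierarchy (interface and overrides alike), existing call sites such as state_write(io) or state_write(io, seq_id) keep compiling unchanged. A reduced sketch of the pattern — types are stubbed out, and llama_state_seq_flags is assumed to be a plain integer bitmask type:

#include <cstdint>

using llama_seq_id          = int32_t;
using llama_state_seq_flags = uint32_t; // assumption: an integer bitmask type

struct llama_io_write_i {};             // stubs, just for the sketch
struct llama_io_read_i  {};

struct memory_iface {
    virtual ~memory_iface() = default;
    // Trailing defaulted parameters: old one- and two-argument callers still compile.
    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1,
                             llama_state_seq_flags flags = 0) const = 0;
    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1,
                             llama_state_seq_flags flags = 0) = 0;
};

struct memory_impl : memory_iface {
    void state_write(llama_io_write_i &, llama_seq_id, llama_state_seq_flags flags) const override {
        (void) flags; // accepted but not interpreted yet, matching the GGML_UNUSED(flags) pattern above
    }
    void state_read(llama_io_read_i &, llama_seq_id, llama_state_seq_flags) override {}
};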
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;
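
The loader constants switch from consecutive integers to distinct powers of two, which is what makes the new TENSOR_SKIP combinable with the existing flags: with the old values, a third constant worth 3 would have been indistinguishable from TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED. A small illustration — the combination shown is an assumed use case, not taken from the diff:

#include <cstdio>

static const int TENSOR_NOT_REQUIRED = 1 << 0; // values as in the diff
static const int TENSOR_DUPLICATED   = 1 << 1;
static const int TENSOR_SKIP         = 1 << 2;

int main() {
    // e.g. a tensor that may be absent and, when present, should not be loaded:
    const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;

    if (flags & TENSOR_SKIP) {
        std::printf("skipping tensor (flags = 0x%x)\n", flags);
    }
    return 0;
}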