@fugood/llama.node 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +9 -9
  8. package/src/LlamaCompletionWorker.cpp +73 -20
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/LlamaContext.cpp +9 -0
  11. package/src/common.hpp +8 -1
  12. package/src/llama.cpp/CMakeLists.txt +2 -0
  13. package/src/llama.cpp/common/arg.cpp +132 -41
  14. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  15. package/src/llama.cpp/common/chat.cpp +311 -9
  16. package/src/llama.cpp/common/chat.h +4 -1
  17. package/src/llama.cpp/common/common.cpp +54 -0
  18. package/src/llama.cpp/common/common.h +46 -9
  19. package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
  20. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  21. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  22. package/src/llama.cpp/ggml/include/ggml.h +28 -2
  23. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
  33. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  37. package/src/llama.cpp/include/llama.h +25 -0
  38. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  39. package/src/llama.cpp/src/llama-chat.cpp +2 -4
  40. package/src/llama.cpp/src/llama-context.cpp +29 -22
  41. package/src/llama.cpp/src/llama-context.h +6 -5
  42. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  43. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  44. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
  45. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  46. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  47. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  48. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  49. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  50. package/src/llama.cpp/src/llama-memory.h +2 -2
  51. package/src/llama.cpp/src/llama-model.cpp +81 -70
  52. package/src/llama.cpp/src/llama-model.h +2 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  54. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp
@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }

-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-    kv_base->state_write(io, seq_id);
-    kv_swa ->state_write(io, seq_id);
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
 }

-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-    kv_base->state_read(io, seq_id);
-    kv_swa ->state_read(io, seq_id);
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
 }

 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
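Note on the hunk above: this is where the new per-sequence state flags take effect. When LLAMA_STATE_SEQ_FLAGS_SWA_ONLY is set, only the sliding-window (SWA) cache is serialized and the full-attention base cache is skipped; with flags == 0 both caches are written as before. A minimal caller sketch, assuming only the types and the flag that appear in this diff (the helper name and the io object are hypothetical, not part of the package):

    // Hypothetical helper; llama_kv_cache_unified_iswa, llama_io_write_i,
    // llama_state_seq_flags and LLAMA_STATE_SEQ_FLAGS_SWA_ONLY are taken from
    // the diff above, write_seq_checkpoint itself is illustrative only.
    static void write_seq_checkpoint(const llama_kv_cache_unified_iswa & kv,
                                     llama_io_write_i & io,
                                     llama_seq_id seq_id,
                                     bool swa_only) {
        const llama_state_seq_flags flags = swa_only ? LLAMA_STATE_SEQ_FLAGS_SWA_ONLY : 0;

        // With the flag set, kv_base is skipped and only kv_swa is written.
        kv.state_write(io, seq_id, flags);
    }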
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h
@@ -56,8 +56,8 @@ public:

     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
     // llama_kv_cache_unified_iswa specific API
package/src/llama.cpp/src/llama-kv-cache-unified.cpp
@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }

 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));

     if (p0 < 0) {
         p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }

     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
                 }
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head = v_heads[s];

-            cells.rm(i);
+            uint32_t new_head = cells.size();

-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }

-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }

     return true;
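The seq_rm change above restores support for seq_id == -1 in the multi-stream cache: the per-sequence declarations move inside the seq_id >= 0 branch, and the "match any sequence" branch now walks every stream, removing cells in [p0, p1) and resetting each stream's search head independently. A hedged usage sketch (the wrapper and the kv object are hypothetical; seq_rm and its argument semantics come from this hunk):

    // Illustrative wrapper around the member function changed above.
    static void drop_range_everywhere(llama_kv_cache_unified & kv, llama_pos p0, llama_pos p1) {
        // seq_id == -1: match any sequence; the loop in the hunk above visits
        // every KV stream, removes cells whose position falls in [p0, p1),
        // and moves each stream's head back to the first freed slot.
        kv.seq_rm(/*seq_id =*/ -1, p0, p1);
    }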
@@ -738,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }

 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];

-        const uint32_t head_cur = v_heads[1];
-
-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);

-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }

-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                     } else {
-                        ss += 'M';
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                     }
                 }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
             }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }

-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                 }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }

-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
             }
-
-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
         }
     }

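The debug path in find_slot is now printed once per unique sequence in the ubatch, indexed by its stream, instead of once for a hard-coded stream. The first dump renders cell occupancy as '.' for an empty cell, the owning sequence id for a single-sequence cell, and 'M' for a cell shared by several sequences, with a " *" marker every 256 cells. A standalone sketch of that rendering logic, using a plain vector in place of the real cell container (kv_cell_stub below is a stand-in, not a llama.cpp type):

    #include <string>
    #include <vector>

    // Stand-in for one KV cell: how many sequences reference it and the id of
    // the first one. Not the real llama.cpp cell type.
    struct kv_cell_stub {
        int seq_count = 0;
        int seq_id    = -1;
    };

    static std::string render_occupancy(const std::vector<kv_cell_stub> & cells) {
        std::string ss;
        for (size_t i = 0; i < cells.size(); ++i) {
            if (cells[i].seq_count == 0) {
                ss += '.';                              // empty cell
            } else if (cells[i].seq_count == 1) {
                ss += std::to_string(cells[i].seq_id);  // owned by one sequence
            } else {
                ss += 'M';                              // shared by multiple sequences
            }
            if (i % 256 == 255) {
                ss += " *";
                ss += '\n';
            }
        }
        return ss;
    }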
@@ -1812,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }

-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));

     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1863,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }

-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));

     uint32_t n_stream_cur;
package/src/llama.cpp/src/llama-kv-cache-unified.h
@@ -136,8 +136,8 @@ public:

     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
     // llama_kv_cache_unified specific API
package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }

-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }

-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -74,8 +74,8 @@ public:

     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     //
     // llama_memory_hybrid specific API
package/src/llama.cpp/src/llama-memory-recurrent.cpp
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }

-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;

@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }

-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));

package/src/llama.cpp/src/llama-memory-recurrent.h
@@ -63,8 +63,8 @@ public:

     // state write/load

-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;

     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
package/src/llama.cpp/src/llama-memory.h
@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //

-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };

 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
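All of the header hunks follow the same pattern: the new flags parameter is appended last with a default of 0, so existing two-argument call sites keep compiling, and backends that do not use flags accept and ignore them (the GGML_UNUSED(flags) lines in the .cpp hunks). A standalone sketch of that pattern, using made-up stand-in types rather than the real llama.cpp interface:

    #include <cstdint>
    #include <cstdio>

    struct io_writer {};             // stand-in for llama_io_write_i
    using  seq_flags = uint32_t;     // stand-in for llama_state_seq_flags

    struct memory_iface {
        virtual ~memory_iface() = default;
        // Trailing defaulted parameter: old call shapes stay source-compatible.
        virtual void state_write(io_writer & io, int32_t seq_id = -1, seq_flags flags = 0) const = 0;
    };

    struct simple_memory : memory_iface {
        void state_write(io_writer & io, int32_t seq_id = -1, seq_flags flags = 0) const override {
            (void) io;
            (void) flags; // this backend ignores flags, mirroring GGML_UNUSED(flags)
            std::printf("state_write for seq %d\n", seq_id);
        }
    };

    int main() {
        io_writer io;
        simple_memory mem;
        mem.state_write(io);         // pre-existing call shape still compiles
        mem.state_write(io, 0, 1);   // new call shape passing a flag bit
        return 0;
    }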