@fugood/llama.node 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +20 -0
  8. package/src/common.hpp +8 -1
  9. package/src/llama.cpp/common/arg.cpp +13 -4
  10. package/src/llama.cpp/common/chat.cpp +33 -2
  11. package/src/llama.cpp/common/common.cpp +0 -15
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  14. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  22. package/src/llama.cpp/include/llama.h +1 -110
  23. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  24. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  25. package/src/llama.cpp/src/llama-arch.h +1 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +5 -197
  29. package/src/llama.cpp/src/llama-context.h +2 -7
  30. package/src/llama.cpp/src/llama-cparams.h +0 -1
  31. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  32. package/src/llama.cpp/src/llama-graph.h +36 -46
  33. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  34. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  35. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  36. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  37. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  40. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  41. package/src/llama.cpp/src/llama-memory.h +3 -8
  42. package/src/llama.cpp/src/llama-model.cpp +449 -246
  43. package/src/llama.cpp/src/llama-model.h +2 -0
package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h}

@@ -14,27 +14,16 @@ struct llama_model;
 struct llama_context;
 
 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //
 
-class llama_kv_cache_unified : public llama_memory_i {
+class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);
 
     // this callback is used to filter out layers that should not be included in the cache
     using layer_filter_cb = std::function<bool(int32_t il)>;
 
-    struct defrag_info {
-        bool empty() const {
-            return ids.empty();
-        }
-
-        // contains information about which cell moves where:
-        //  - cell i moves to ids[i]
-        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
-        std::vector<uint32_t> ids;
-    };
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -92,7 +81,7 @@ public:
 
     using slot_info_vec_t = std::vector<slot_info>;
 
-    llama_kv_cache_unified(
+    llama_kv_cache(
             const llama_model & model,
             layer_filter_cb && filter,
             ggml_type type_k,
@@ -106,7 +95,7 @@ public:
             uint32_t n_swa,
             llama_swa_type swa_type);
 
-    ~llama_kv_cache_unified() = default;
+    ~llama_kv_cache() = default;
 
     //
     // llama_memory_i
@@ -140,7 +129,7 @@ public:
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
-    // llama_kv_cache_unified specific API
+    // llama_kv_cache specific API
     //
 
     uint32_t get_size() const;
@@ -173,7 +162,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
 
-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
+    bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
 
     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous
@@ -241,7 +230,7 @@ private:
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
     std::vector<uint32_t> v_heads;
 
-    std::vector<llama_kv_cells_unified> v_cells;
+    std::vector<llama_kv_cells> v_cells;
 
     // maps from a sequence id to a stream id
     std::vector<uint32_t> seq_to_stream;
@@ -254,9 +243,6 @@ private:
     // model layer id -> KV cache layer id
     std::unordered_map<int32_t, int32_t> map_layer_ids;
 
-    // return non-empty vector if cells have been moved
-    defrag_info defrag_prepare(int32_t n_max_nodes) const;
-
     size_t total_size() const;
 
     size_t size_k_bytes() const;
@@ -277,11 +263,6 @@ private:
             llm_graph_result * res,
             llama_context * lctx) const;
 
-    ggml_cgraph * build_graph_defrag(
-            llm_graph_result * res,
-            llama_context * lctx,
-            const defrag_info & dinfo) const;
-
     struct cell_ranges_t {
         uint32_t strm;
@@ -295,35 +276,33 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };
 
-class llama_kv_cache_unified_context : public llama_memory_context_i {
+class llama_kv_cache_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info      = llama_kv_cache_unified::defrag_info;
-    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
+    using slot_info_vec_t  = llama_kv_cache::slot_info_vec_t;
+    using stream_copy_info = llama_kv_cache::stream_copy_info;
 
     // used for errors
-    llama_kv_cache_unified_context(llama_memory_status status);
+    llama_kv_cache_context(llama_memory_status status);
 
     // used to create a full-cache context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv);
+    llama_kv_cache_context(
+            llama_kv_cache * kv);
 
     // used to create an update context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo,
             stream_copy_info sc_info);
 
     // used to create a batch procesing context from a batch
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
            slot_info_vec_t sinfos,
            std::vector<llama_ubatch> ubatches);
 
-    virtual ~llama_kv_cache_unified_context();
+    virtual ~llama_kv_cache_context();
 
     //
     // llama_memory_context_i
@@ -336,7 +315,7 @@ public:
     const llama_ubatch & get_ubatch() const override;
 
     //
-    // llama_kv_cache_unified_context specific API
+    // llama_kv_cache_context specific API
     //
 
     uint32_t get_n_kv() const;
@@ -365,7 +344,7 @@ public:
 private:
     llama_memory_status status;
 
-    llama_kv_cache_unified * kv;
+    llama_kv_cache * kv;
     llama_context * lctx;
 
     //
@@ -374,8 +353,6 @@ private:
 
     bool do_shift = false;
 
-    defrag_info dinfo;
-
     stream_copy_info sc_info;
 
     //
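
The upshot of the hunks above: the class drops its _unified suffix and loses the entire defrag path (defrag_info, defrag_prepare(), build_graph_defrag(), and the defrag argument to update()). A minimal sketch of a caller against the new update() signature; apart from llama_kv_cache::update() and stream_copy_info, every name below is an illustrative assumption, not something taken from this diff:

    // Hedged sketch, not from the diff: only llama_kv_cache::update() and
    // stream_copy_info come from the header above; the rest is assumed.
    static bool apply_pending_updates(llama_kv_cache & cache, llama_context * lctx) {
        const bool do_shift = true;                 // e.g. a pending K-shift after a seq_add
        llama_kv_cache::stream_copy_info sc_info;   // stream copies queued by seq_cp bookkeeping

        // 1.1.7 also threaded a defrag_info through here; the parameter is gone in 1.1.9.
        return cache.update(lctx, do_shift, sc_info);
    }
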
package/src/llama.cpp/src/llama-kv-cells.h

@@ -11,7 +11,7 @@
 
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
-class llama_kv_cells_unified {
+class llama_kv_cells {
 public:
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {
@@ -77,30 +77,30 @@ public:
     }
 
     // move cell isrc to idst (used during defrag)
-    void mv(uint32_t isrc, uint32_t idst) {
-        assert(isrc < pos.size());
-        assert(idst < pos.size());
+    //void mv(uint32_t isrc, uint32_t idst) {
+    //    assert(isrc < pos.size());
+    //    assert(idst < pos.size());
 
-        assert(pos[idst] == -1);
-        assert(pos[isrc] != -1);
+    //    assert(pos[idst] == -1);
+    //    assert(pos[isrc] != -1);
 
-        pos  [idst] = pos  [isrc];
-        shift[idst] = shift[isrc];
-        seq  [idst] = seq  [isrc];
+    //    pos  [idst] = pos  [isrc];
+    //    shift[idst] = shift[isrc];
+    //    seq  [idst] = seq  [isrc];
 
-        pos  [isrc] = -1;
-        shift[isrc] = 0;
-        seq  [isrc].reset();
+    //    pos  [isrc] = -1;
+    //    shift[isrc] = 0;
+    //    seq  [isrc].reset();
 
-        used.erase (isrc);
-        used.insert(idst);
-    }
+    //    used.erase (isrc);
+    //    used.insert(idst);
+    //}
 
     // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
-    llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
+    llama_kv_cells cp(uint32_t i, uint32_t n) const {
         assert(i + n <= pos.size());
 
-        llama_kv_cells_unified res;
+        llama_kv_cells res;
 
         res.resize(n);
@@ -117,8 +117,8 @@ public:
     }
 
     // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
-        llama_kv_cells_unified res;
+    llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells res;
 
         res.resize(idxs.size());
@@ -135,7 +135,7 @@ public:
     }
 
     // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
-    void set(uint32_t i, const llama_kv_cells_unified & other) {
+    void set(uint32_t i, const llama_kv_cells & other) {
         assert(i + other.pos.size() <= pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
@@ -165,7 +165,7 @@ public:
     }
 
     // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
         assert(idxs.size() == other.pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
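
Aside from the rename, the only change in this file is that mv(), the last defrag-only helper, is now commented out. The surviving cp()/set() pair implements the save/restore pattern the comments describe; a rough usage sketch with the renamed class (internal API, arbitrary sizes and indices):

    // Hedged sketch of the save/restore pattern (internal llama.cpp API).
    llama_kv_cells cells;
    cells.resize(64);

    // snapshot the state of cells [8, 8 + 16) ...
    llama_kv_cells backup = cells.cp(8, 16);

    // ... the cache mutates these cells while placing a ubatch ...

    // ... and on failure the same range is rolled back
    cells.set(8, backup);
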
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -30,7 +30,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
     hparams(model.hparams),
-    mem_attn(new llama_kv_cache_unified(
+    mem_attn(new llama_kv_cache(
         model,
         filter_attn == nullptr ?
             [&](int32_t il) { return !hparams.is_recurrent(il); }
@@ -179,7 +179,7 @@ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id,
     mem_recr->state_read(io, seq_id);
 }
 
-llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
+llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
     return mem_attn.get();
 }
 
@@ -210,7 +210,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
     ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
@@ -248,8 +248,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
     return ubatches[i_next];
 }
 
-const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
+const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
+    return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
 }
 
 const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
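
The default filter_attn lambda in the first hunk is the core of the hybrid memory: each layer index is routed to exactly one of the two sub-caches. A self-contained sketch of that routing rule; the filter_recr counterpart is assumed by symmetry and does not appear in these hunks:

    #include <cstdint>
    #include <functional>

    // is_recurrent stands in for llama_hparams::is_recurrent(il).
    using layer_filter_cb = std::function<bool(int32_t)>;

    // attention layers -> llama_kv_cache (matches the lambda in the hunk above)
    layer_filter_cb default_filter_attn(layer_filter_cb is_recurrent) {
        return [=](int32_t il) { return !is_recurrent(il); };
    }

    // recurrent layers -> llama_memory_recurrent (assumed by symmetry)
    layer_filter_cb default_filter_recr(layer_filter_cb is_recurrent) {
        return [=](int32_t il) { return is_recurrent(il); };
    }
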
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -2,7 +2,7 @@
 
 #include "llama-batch.h"
 #include "llama-graph.h"
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 #include "llama-memory.h"
 #include "llama-memory-recurrent.h"
 
@@ -13,7 +13,7 @@
 // llama_memory_hybrid
 //
 
-// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
+// utilizes instances of llama_memory_recurrent and llama_kv_cache to
 // support models where each layer may be either attention-based or recurrent
 
 class llama_memory_hybrid : public llama_memory_i {
@@ -81,19 +81,19 @@ public:
     // llama_memory_hybrid specific API
     //
 
-    llama_kv_cache_unified * get_mem_attn() const;
+    llama_kv_cache * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;
 
 private:
     const llama_hparams & hparams;
 
-    const std::unique_ptr<llama_kv_cache_unified> mem_attn;
+    const std::unique_ptr<llama_kv_cache> mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
 };
 
 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
 
     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);
@@ -125,7 +125,7 @@ public:
     // llama_memory_hybrid_context
     //
 
-    const llama_kv_cache_unified_context * get_attn() const;
+    const llama_kv_cache_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;
 
 private:
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -12,7 +12,7 @@
 //
 
 // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
-//       see the implementation of llama_kv_cache_unified_context_i for an example how to do it
+//       see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
 
 
@@ -36,8 +36,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);
36
36
 
37
37
  // the interface for managing the memory context during batch processing
38
38
  // this interface is implemented per memory type. see:
39
- // - llama_kv_cache_unified_context
40
- // - llama_kv_cache_unified_iswa_context
39
+ // - llama_kv_cache_context
40
+ // - llama_kv_cache_iswa_context
41
41
  // ...
42
42
  //
43
43
  // the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@@ -77,7 +77,7 @@ struct llama_memory_i {
77
77
  // simulate full cache, used for allocating worst-case compute buffers
78
78
  virtual llama_memory_context_ptr init_full() = 0;
79
79
 
80
- // prepare for any pending memory updates, such as shifts, defrags, etc.
80
+ // prepare for any pending memory updates, such as shifts, copies, etc.
81
81
  // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
82
82
  virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
83
83
 
@@ -109,8 +109,3 @@ struct llama_memory_i {
109
109
  };
110
110
 
111
111
  using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
112
-
113
- // TODO: temporary until the llama_kv_cache is removed from the public API
114
- struct llama_kv_cache : public llama_memory_i {
115
- virtual ~llama_kv_cache() = default;
116
- };
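
With defrag gone, init_update() now covers only shifts and stream copies, and removing the temporary llama_kv_cache shim at the bottom of the header frees that name for the renamed cache itself. A hedged sketch of a driver over this interface — get_status() and apply() are llama_memory_context_i members declared earlier in this header, and the status constants come from the same file, but the wrapper function itself is illustrative:

    // Hedged sketch: drive a pending memory update through llama_memory_i.
    static bool update_memory(llama_memory_i & mem, llama_context * lctx) {
        llama_memory_context_ptr mctx = mem.init_update(lctx, /*optimize=*/false);

        switch (mctx->get_status()) {
            case LLAMA_MEMORY_STATUS_NO_UPDATE:
                return true;           // nothing pending
            case LLAMA_MEMORY_STATUS_SUCCESS:
                return mctx->apply();  // perform the shifts / stream copies
            default:
                return false;          // FAILED_PREPARE / FAILED_COMPUTE
        }
    }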