@fugood/llama.node 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  3. package/src/llama.cpp/common/arg.cpp +37 -0
  4. package/src/llama.cpp/common/common.cpp +22 -6
  5. package/src/llama.cpp/common/common.h +14 -1
  6. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  7. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  8. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  12. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  14. package/src/llama.cpp/include/llama.h +13 -48
  15. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  16. package/src/llama.cpp/src/llama-arch.h +16 -1
  17. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  18. package/src/llama.cpp/src/llama-batch.h +24 -18
  19. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  20. package/src/llama.cpp/src/llama-chat.h +2 -0
  21. package/src/llama.cpp/src/llama-context.cpp +134 -95
  22. package/src/llama.cpp/src/llama-context.h +13 -16
  23. package/src/llama.cpp/src/llama-cparams.h +3 -2
  24. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  25. package/src/llama.cpp/src/llama-graph.h +162 -126
  26. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  27. package/src/llama.cpp/src/llama-hparams.h +11 -1
  28. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  29. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  30. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  31. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  32. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  34. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  35. package/src/llama.cpp/src/llama-model.h +18 -4
  36. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  37. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  38. package/src/llama.cpp/src/llama-vocab.h +43 -0
  39. package/src/llama.cpp/src/unicode.cpp +207 -0
  40. package/src/llama.cpp/src/unicode.h +2 -0
package/src/llama.cpp/src/llama-kv-cache-unified.h

@@ -35,16 +35,50 @@ public:
         std::vector<uint32_t> ids;
     };

+    struct stream_copy_info {
+        bool empty() const {
+            assert(ssrc.size() == sdst.size());
+            return ssrc.empty();
+        }
+
+        std::vector<uint32_t> ssrc;
+        std::vector<uint32_t> sdst;
+    };
+
     // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
     // KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]]
     struct slot_info {
         // data for ggml_set_rows
         using idx_vec_t = std::vector<uint32_t>;

-        idx_vec_t idxs;
+        // number of streams: ns = s1 - s0 + 1
+        llama_seq_id s0;
+        llama_seq_id s1;
+
+        std::vector<llama_seq_id> strm; // [ns]
+        std::vector<idx_vec_t>    idxs; // [ns]

         uint32_t head() const {
-            return idxs.at(0);
+            GGML_ASSERT(idxs.size() == 1);
+            GGML_ASSERT(!idxs[0].empty());
+
+            return idxs[0][0];
+        }
+
+        void resize(size_t n) {
+            strm.resize(n);
+            idxs.resize(n);
+        }
+
+        size_t size() const {
+            GGML_ASSERT(idxs.size() == strm.size());
+            GGML_ASSERT(!idxs.empty());
+
+            return idxs[0].size();
+        }
+
+        size_t n_stream() const {
+            return strm.size();
         }

         bool empty() const {
@@ -54,9 +88,6 @@ public:
         void clear() {
             idxs.clear();
         }
-
-        // TODO: implement
-        //std::vector<idx_vec_t> seq_idxs;
     };

     using slot_info_vec_t = std::vector<slot_info>;
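
Note: the hunk above replaces slot_info's single idx_vec_t with one index vector per KV stream. The standalone C++ sketch below mirrors those declarations (substituting assert for GGML_ASSERT and int32_t for llama_seq_id, which are assumptions of this example) to show how head(), size() and n_stream() relate: head() is only meaningful for a single-stream slot, while size() is the number of cell indices per stream.

// Standalone illustration of the multi-stream slot_info declared in the diff above.
// Assumption: llama_seq_id is a 32-bit signed integer and GGML_ASSERT behaves like assert.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_seq_id = int32_t;

struct slot_info {
    using idx_vec_t = std::vector<uint32_t>;

    // number of streams: ns = s1 - s0 + 1
    llama_seq_id s0;
    llama_seq_id s1;

    std::vector<llama_seq_id> strm; // [ns]
    std::vector<idx_vec_t>    idxs; // [ns]

    uint32_t head() const {
        assert(idxs.size() == 1); // only defined for a single stream
        assert(!idxs[0].empty());
        return idxs[0][0];
    }

    void resize(size_t n) {
        strm.resize(n);
        idxs.resize(n);
    }

    size_t size() const { // cell indices per stream
        assert(idxs.size() == strm.size());
        assert(!idxs.empty());
        return idxs[0].size();
    }

    size_t n_stream() const {
        return strm.size();
    }
};

int main() {
    slot_info sinfo;
    sinfo.s0 = 0;
    sinfo.s1 = 1; // two streams: ns = s1 - s0 + 1 = 2
    sinfo.resize(2);
    sinfo.strm    = {0, 1};
    sinfo.idxs[0] = {16, 17, 18}; // cells assigned in stream 0
    sinfo.idxs[1] = {40, 41, 42}; // cells assigned in stream 1

    std::printf("streams = %zu, tokens per stream = %zu\n", sinfo.n_stream(), sinfo.size());
    return 0;
}
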
@@ -68,6 +99,7 @@ public:
             ggml_type type_v,
             bool v_trans,
             bool offload,
+            bool unified,
             uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_pad,
@@ -111,7 +143,8 @@ public:
     // llama_kv_cache_unified specific API
     //

-    uint32_t get_size() const;
+    uint32_t get_size()     const;
+    uint32_t get_n_stream() const;

     bool get_has_shift() const;

@@ -121,9 +154,12 @@ public:

     uint32_t get_n_kv() const;

+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
     // get views of the current state of the cache
-    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
-    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;

     // store k_cur and v_cur in the cache based on the provided head location
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
@@ -137,7 +173,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);

-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
+    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);

     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous
@@ -157,8 +193,9 @@ public:
     void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
     void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;

+    void set_input_k_shift(ggml_tensor * dst) const;
+
     void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
-    void set_input_k_shift (ggml_tensor * dst) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;

 private:
@@ -172,15 +209,15 @@ private:

         ggml_tensor * k;
         ggml_tensor * v;
+
+        std::vector<ggml_tensor *> k_stream;
+        std::vector<ggml_tensor *> v_stream;
     };

     bool v_trans = true; // the value tensor is transposed

-    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
-    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
-    uint32_t head = 0;
-
     const uint32_t n_seq_max = 1;
+    const uint32_t n_stream  = 1;

     // required padding
     const uint32_t n_pad = 1;
@@ -193,14 +230,24 @@ private:

     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    int supports_set_rows = false;
+    bool supports_set_rows = false;

     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

-    llama_kv_cells_unified cells;
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    std::vector<uint32_t> v_heads;
+
+    std::vector<llama_kv_cells_unified> v_cells;
+
+    // maps from a sequence id to a stream id
+    std::vector<uint32_t> seq_to_stream;
+
+    // pending stream copies that will be applied during the next update
+    stream_copy_info sc_info;

     std::vector<kv_layer> layers;

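
Note: the private members above replace the single head and cells with per-stream vectors (v_heads, v_cells) plus a seq_to_stream table. The sketch below is illustrative only: the mapping policy (every sequence shares stream 0 when the cache is unified, one stream per sequence otherwise) is an assumption of this example, not something stated in the diff.

// Illustrative sketch of populating a seq_to_stream table alongside per-stream heads.
// Assumption: unified == true collapses all sequences onto stream 0; otherwise each
// sequence id gets its own stream. This policy is an example, not taken from the diff.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint32_t n_seq_max = 4;

    for (bool unified : {true, false}) {
        const uint32_t n_stream = unified ? 1 : n_seq_max;

        // one search head per stream (cf. v_heads); all start at cell 0
        std::vector<uint32_t> v_heads(n_stream, 0);

        // maps from a sequence id to a stream id (cf. seq_to_stream)
        std::vector<uint32_t> seq_to_stream(n_seq_max);
        for (uint32_t s = 0; s < n_seq_max; ++s) {
            seq_to_stream[s] = unified ? 0 : s;
        }

        std::printf("unified=%d: %zu heads, seq 3 -> stream %u\n",
                    (int) unified, v_heads.size(), seq_to_stream[3]);
    }
    return 0;
}
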
@@ -226,29 +273,34 @@ private:
             float freq_base,
             float freq_scale) const;

-    llm_graph_result_ptr build_graph_shift(
-            const llama_cparams & cparams,
-            ggml_context * ctx,
-            ggml_cgraph * gf) const;
+    ggml_cgraph * build_graph_shift(
+            llm_graph_result * res,
+            llama_context * lctx) const;

-    llm_graph_result_ptr build_graph_defrag(
-            const llama_cparams & cparams,
-            ggml_context * ctx,
-            ggml_cgraph * gf,
+    ggml_cgraph * build_graph_defrag(
+            llm_graph_result * res,
+            llama_context * lctx,
             const defrag_info & dinfo) const;

-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+    struct cell_ranges_t {
+        uint32_t strm;

-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+        std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
+    };
+
+    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };

 class llama_kv_cache_unified_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info     = llama_kv_cache_unified::defrag_info;
+    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
+    using defrag_info      = llama_kv_cache_unified::defrag_info;
+    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;

     // used for errors
     llama_kv_cache_unified_context(llama_memory_status status);
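
Note: state_write_meta/state_write_data now take a cell_ranges_t, which pairs a stream id with half-open ranges of cells. A hedged sketch of how contiguous occupied cells of one stream could be collapsed into such ranges follows; the collapse() helper and its grouping logic are assumptions of this example, while the struct layout mirrors the declaration above.

// Illustrative sketch: collapse sorted occupied cell indices of one stream into the
// "from inclusive, to exclusive" ranges carried by a cell_ranges_t-like structure.
// The collapse() helper is hypothetical and exists only for this example.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct cell_ranges_t {
    uint32_t strm;
    std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
};

static cell_ranges_t collapse(uint32_t strm, const std::vector<uint32_t> & occupied_sorted) {
    cell_ranges_t cr;
    cr.strm = strm;
    for (const uint32_t c : occupied_sorted) {
        if (cr.data.empty() || c != cr.data.back().second) {
            cr.data.push_back({c, c + 1}); // start a new range
        } else {
            cr.data.back().second = c + 1; // extend the current range
        }
    }
    return cr;
}

int main() {
    // cells 3,4,5 and 9,10 of stream 0 are occupied
    const cell_ranges_t cr = collapse(0, {3, 4, 5, 9, 10});
    for (const auto & r : cr.data) {
        std::printf("stream %u: [%u, %u)\n", cr.strm, r.first, r.second);
    }
    return 0;
}
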
@@ -262,7 +314,8 @@ public:
             llama_kv_cache_unified * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo);
+            defrag_info dinfo,
+            stream_copy_info sc_info);

     // used to create a batch procesing context from a batch
     llama_kv_cache_unified_context(
@@ -288,6 +341,9 @@ public:

     uint32_t get_n_kv() const;

+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
@@ -320,6 +376,8 @@ private:

     defrag_info dinfo;

+    stream_copy_info sc_info;
+
     //
     // batch processing context
     //
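
Note: both the cache and its context now hold a stream_copy_info of pending copies, described above as being applied during the next update. The standalone sketch below mirrors the ssrc/sdst layout from the diff; the recording and apply steps are assumptions of this example, shown against a dummy per-stream state.

// Illustrative sketch of queueing stream copies (cf. stream_copy_info) and draining
// them later. Only the ssrc/sdst layout mirrors the diff; the apply logic is an
// assumption of this example.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct stream_copy_info {
    bool empty() const {
        assert(ssrc.size() == sdst.size());
        return ssrc.empty();
    }

    std::vector<uint32_t> ssrc;
    std::vector<uint32_t> sdst;
};

int main() {
    // dummy per-stream state standing in for the real per-stream KV data
    std::vector<int> stream_state = {10, 20, 30, 40};

    // record two pending copies: stream 0 -> stream 2 and stream 1 -> stream 3
    stream_copy_info sc_info;
    sc_info.ssrc = {0, 1};
    sc_info.sdst = {2, 3};

    // later, e.g. during an update step: apply and clear the pending copies
    if (!sc_info.empty()) {
        for (size_t i = 0; i < sc_info.ssrc.size(); ++i) {
            stream_state[sc_info.sdst[i]] = stream_state[sc_info.ssrc[i]];
        }
        sc_info = {};
    }

    std::printf("stream 2 = %d, stream 3 = %d\n", stream_state[2], stream_state[3]);
    return 0;
}
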
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -38,6 +38,7 @@ llama_memory_hybrid::llama_memory_hybrid(
             type_v,
             v_trans,
             offload,
+            1,
             kv_size,
             n_seq_max,
             n_pad,
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -25,9 +25,6 @@ llama_memory_recurrent::llama_memory_recurrent(
             uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;

-    LLAMA_LOG_INFO("%s: mem_size = %u, n_seq_max = %u, type_r = '%s', type_s = '%s', n_layer = %d\n",
-            __func__, mem_size, n_seq_max, ggml_type_name(type_r), ggml_type_name(type_s), n_layer);
-
     head = 0;
     size = mem_size;
     used = 0;
@@ -84,7 +81,7 @@ llama_memory_recurrent::llama_memory_recurrent(

         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
-            throw std::runtime_error("failed to create ggml context for kv cache");
+            throw std::runtime_error("failed to create ggml context for rs cache");
         }

         ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
@@ -102,10 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent(

         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            throw std::runtime_error("failed to allocate buffer for kv cache");
+            throw std::runtime_error("failed to allocate buffer for rs cache");
         }
         ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         bufs.emplace_back(buf);
     }

@@ -113,8 +110,8 @@ llama_memory_recurrent::llama_memory_recurrent(
     const size_t memory_size_r = size_r_bytes();
     const size_t memory_size_s = size_s_bytes();

-    LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
-            (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f),
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
+            (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
             ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
             ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
 }
@@ -449,7 +446,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // A slot should be always be contiguous.

     // can only process batches with an equal number of new tokens in each sequence
-    GGML_ASSERT(ubatch.equal_seqs);
+    GGML_ASSERT(ubatch.equal_seqs());

     int32_t min = size - 1;
     int32_t max = 0;