@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
package/src/llama.cpp/src/llama-kv-cache-unified.h

@@ -35,16 +35,50 @@ public:
         std::vector<uint32_t> ids;
     };
 
+    struct stream_copy_info {
+        bool empty() const {
+            assert(ssrc.size() == sdst.size());
+            return ssrc.empty();
+        }
+
+        std::vector<uint32_t> ssrc;
+        std::vector<uint32_t> sdst;
+    };
+
     // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
     // KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]]
     struct slot_info {
         // data for ggml_set_rows
         using idx_vec_t = std::vector<uint32_t>;
 
-
+        // number of streams: ns = s1 - s0 + 1
+        llama_seq_id s0;
+        llama_seq_id s1;
+
+        std::vector<llama_seq_id> strm; // [ns]
+        std::vector<idx_vec_t> idxs; // [ns]
 
         uint32_t head() const {
-
+            GGML_ASSERT(idxs.size() == 1);
+            GGML_ASSERT(!idxs[0].empty());
+
+            return idxs[0][0];
+        }
+
+        void resize(size_t n) {
+            strm.resize(n);
+            idxs.resize(n);
+        }
+
+        size_t size() const {
+            GGML_ASSERT(idxs.size() == strm.size());
+            GGML_ASSERT(!idxs.empty());
+
+            return idxs[0].size();
+        }
+
+        size_t n_stream() const {
+            return strm.size();
         }
 
         bool empty() const {
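
Aside (not part of the diff): the extended slot_info above replaces the single index vector with one idx_vec_t per KV stream, so token i of stream s is placed into cells[idxs[s][i]], as the comment in the header describes. Below is a minimal standalone sketch of that mapping using a stand-in struct and made-up cell indices, not the real (private) llama_kv_cache_unified::slot_info:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in mirroring the slot_info fields shown in the diff (illustrative only).
struct slot_info_sketch {
    using idx_vec_t = std::vector<uint32_t>;

    std::vector<int32_t>   strm; // stream id of each entry, size [ns]
    std::vector<idx_vec_t> idxs; // destination cell indices per stream, size [ns]

    size_t n_stream() const { return strm.size(); }

    size_t size() const {
        assert(idxs.size() == strm.size() && !idxs.empty());
        return idxs[0].size(); // tokens per stream in this ubatch
    }
};

int main() {
    // hypothetical ubatch: two streams, three tokens each
    slot_info_sketch sinfo;
    sinfo.strm = {0, 1};
    sinfo.idxs = {{4, 5, 6}, {10, 11, 12}};

    for (size_t s = 0; s < sinfo.n_stream(); ++s) {
        for (size_t i = 0; i < sinfo.size(); ++i) {
            // token i of stream s goes to this KV cell
            printf("stream %d: token %zu -> cell %u\n", sinfo.strm[s], i, sinfo.idxs[s][i]);
        }
    }
    return 0;
}
```
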
@@ -54,9 +88,6 @@ public:
         void clear() {
             idxs.clear();
         }
-
-        // TODO: implement
-        //std::vector<idx_vec_t> seq_idxs;
     };
 
     using slot_info_vec_t = std::vector<slot_info>;
@@ -68,6 +99,7 @@ public:
             ggml_type type_v,
             bool v_trans,
             bool offload,
+            bool unified,
             uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_pad,
@@ -111,7 +143,8 @@ public:
     // llama_kv_cache_unified specific API
     //
 
-    uint32_t get_size()
+    uint32_t get_size() const;
+    uint32_t get_n_stream() const;
 
     bool get_has_shift() const;
 
@@ -121,9 +154,12 @@ public:
 
     uint32_t get_n_kv() const;
 
+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
     // get views of the current state of the cache
-    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
-    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
@@ -137,7 +173,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
 
-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
+    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
 
     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous
@@ -157,8 +193,9 @@ public:
     void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
     void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
 
+    void set_input_k_shift(ggml_tensor * dst) const;
+
     void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
-    void set_input_k_shift (ggml_tensor * dst) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
 private:
@@ -172,15 +209,15 @@ private:
 
         ggml_tensor * k;
         ggml_tensor * v;
+
+        std::vector<ggml_tensor *> k_stream;
+        std::vector<ggml_tensor *> v_stream;
     };
 
     bool v_trans = true; // the value tensor is transposed
 
-    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
-    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
-    uint32_t head = 0;
-
     const uint32_t n_seq_max = 1;
+    const uint32_t n_stream = 1;
 
     // required padding
     const uint32_t n_pad = 1;
@@ -193,14 +230,24 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-
+    bool supports_set_rows = false;
 
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
-
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    std::vector<uint32_t> v_heads;
+
+    std::vector<llama_kv_cells_unified> v_cells;
+
+    // maps from a sequence id to a stream id
+    std::vector<uint32_t> seq_to_stream;
+
+    // pending stream copies that will be applied during the next update
+    stream_copy_info sc_info;
 
     std::vector<kv_layer> layers;
 
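
Aside (not part of the diff): the new members replace the single head/cells pair with one entry per stream, and seq_to_stream decides which stream a sequence writes to. Below is a rough sketch of a plausible setup, assuming (not confirmed by this diff) that unified mode keeps every sequence on stream 0 while non-unified mode gives each sequence its own stream; stream_copy_info is mimicked by a stand-in type:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the per-stream copy bookkeeping added in the diff.
struct stream_copy_info_sketch {
    std::vector<uint32_t> ssrc; // source stream ids
    std::vector<uint32_t> sdst; // destination stream ids

    bool empty() const {
        assert(ssrc.size() == sdst.size());
        return ssrc.empty();
    }
};

int main() {
    const uint32_t n_seq_max = 4;
    const bool     unified   = false; // assumption: one stream per sequence when not unified

    // maps from a sequence id to a stream id (all sequences share stream 0 in unified mode)
    std::vector<uint32_t> seq_to_stream(n_seq_max);
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        seq_to_stream[s] = unified ? 0 : s;
    }

    // record a hypothetical pending whole-stream copy; in the cache it would be
    // applied during the next update rather than immediately
    stream_copy_info_sketch sc_info;
    sc_info.ssrc.push_back(seq_to_stream[0]);
    sc_info.sdst.push_back(seq_to_stream[2]);

    printf("pending copies: %zu (empty = %d)\n", sc_info.ssrc.size(), (int) sc_info.empty());
    return 0;
}
```
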
@@ -226,29 +273,34 @@ private:
             float freq_base,
             float freq_scale) const;
 
-
-
-
-            ggml_cgraph * gf) const;
+    ggml_cgraph * build_graph_shift(
+            llm_graph_result * res,
+            llama_context * lctx) const;
 
-
-
-
-            ggml_cgraph * gf,
+    ggml_cgraph * build_graph_defrag(
+            llm_graph_result * res,
+            llama_context * lctx,
             const defrag_info & dinfo) const;
 
-
-
+    struct cell_ranges_t {
+        uint32_t strm;
 
-
-
+        std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
+    };
+
+    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };
 
 class llama_kv_cache_unified_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t
-    using defrag_info
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using defrag_info = llama_kv_cache_unified::defrag_info;
+    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
 
     // used for errors
     llama_kv_cache_unified_context(llama_memory_status status);
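
Aside (not part of the diff): cell_ranges_t groups the occupied-cell ranges of one stream for the state read/write helpers; the "from inclusive, to exclusive" convention in its comment can be read as below (stand-in type, hypothetical values):

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Stand-in mirroring the shape of cell_ranges_t from the diff (illustrative only).
struct cell_ranges_sketch {
    uint32_t strm;                                   // which KV stream these ranges belong to
    std::vector<std::pair<uint32_t, uint32_t>> data; // ranges: [first, second) over cell indices
};

int main() {
    cell_ranges_sketch cr;
    cr.strm = 0;
    cr.data = {{0, 3}, {8, 10}}; // cells 0,1,2 and 8,9 are occupied

    uint32_t cell_count = 0;
    for (const auto & r : cr.data) {
        cell_count += r.second - r.first; // "from inclusive, to exclusive"
    }
    printf("stream %u: %u occupied cells\n", cr.strm, cell_count);
    return 0;
}
```
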
@@ -262,7 +314,8 @@ public:
             llama_kv_cache_unified * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo
+            defrag_info dinfo,
+            stream_copy_info sc_info);
 
     // used to create a batch procesing context from a batch
     llama_kv_cache_unified_context(
@@ -288,6 +341,9 @@ public:
 
     uint32_t get_n_kv() const;
 
+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
@@ -320,6 +376,8 @@ private:
 
     defrag_info dinfo;
 
+    stream_copy_info sc_info;
+
     //
     // batch processing context
     //
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -25,9 +25,6 @@ llama_memory_recurrent::llama_memory_recurrent(
         uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
-    LLAMA_LOG_INFO("%s: mem_size = %u, n_seq_max = %u, type_r = '%s', type_s = '%s', n_layer = %d\n",
-            __func__, mem_size, n_seq_max, ggml_type_name(type_r), ggml_type_name(type_s), n_layer);
-
     head = 0;
     size = mem_size;
     used = 0;
@@ -84,7 +81,7 @@ llama_memory_recurrent::llama_memory_recurrent(
 
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
-            throw std::runtime_error("failed to create ggml context for
+            throw std::runtime_error("failed to create ggml context for rs cache");
         }
 
         ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
@@ -102,10 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
-            throw std::runtime_error("failed to allocate buffer for
+            throw std::runtime_error("failed to allocate buffer for rs cache");
         }
         ggml_backend_buffer_clear(buf, 0);
-        LLAMA_LOG_INFO("%s: %10s
+        LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         bufs.emplace_back(buf);
     }
 
@@ -113,8 +110,8 @@ llama_memory_recurrent::llama_memory_recurrent(
     const size_t memory_size_r = size_r_bytes();
     const size_t memory_size_s = size_s_bytes();
 
-    LLAMA_LOG_INFO("%s:
-            (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f),
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
+            (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
             ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
             ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
 }
@@ -449,7 +446,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // A slot should be always be contiguous.
 
     // can only process batches with an equal number of new tokens in each sequence
-    GGML_ASSERT(ubatch.equal_seqs);
+    GGML_ASSERT(ubatch.equal_seqs());
 
     int32_t min = size - 1;
     int32_t max = 0;