@fugood/llama.node 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +20 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +13 -4
- package/src/llama.cpp/common/chat.cpp +33 -2
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -197
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +449 -246
- package/src/llama.cpp/src/llama-model.h +2 -0
package/src/llama.cpp/src/llama-kv-cache.h (renamed from llama-kv-cache-unified.h):

@@ -14,27 +14,16 @@ struct llama_model;
 struct llama_context;
 
 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //
 
-class llama_kv_cache_unified : public llama_memory_i {
+class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);
 
     // this callback is used to filter out layers that should not be included in the cache
     using layer_filter_cb = std::function<bool(int32_t il)>;
 
-    struct defrag_info {
-        bool empty() const {
-            return ids.empty();
-        }
-
-        // contains information about which cell moves where:
-        //  - cell i moves to ids[i]
-        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
-        std::vector<uint32_t> ids;
-    };
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -92,7 +81,7 @@ public:
 
     using slot_info_vec_t = std::vector<slot_info>;
 
-    llama_kv_cache_unified(
+    llama_kv_cache(
             const llama_model & model,
             layer_filter_cb && filter,
             ggml_type type_k,
@@ -106,7 +95,7 @@ public:
             uint32_t n_swa,
             llama_swa_type swa_type);
 
-    ~llama_kv_cache_unified() = default;
+    ~llama_kv_cache() = default;
 
     //
     // llama_memory_i
@@ -140,7 +129,7 @@ public:
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
-    // llama_kv_cache_unified specific API
+    // llama_kv_cache specific API
     //
 
     uint32_t get_size() const;
@@ -173,7 +162,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
 
-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
+    bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
 
     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous
@@ -241,7 +230,7 @@ private:
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
     std::vector<uint32_t> v_heads;
 
-    std::vector<llama_kv_cells_unified> v_cells;
+    std::vector<llama_kv_cells> v_cells;
 
     // maps from a sequence id to a stream id
     std::vector<uint32_t> seq_to_stream;
@@ -254,9 +243,6 @@ private:
     // model layer id -> KV cache layer id
     std::unordered_map<int32_t, int32_t> map_layer_ids;
 
-    // return non-empty vector if cells have been moved
-    defrag_info defrag_prepare(int32_t n_max_nodes) const;
-
     size_t total_size() const;
 
     size_t size_k_bytes() const;
@@ -277,11 +263,6 @@ private:
             llm_graph_result * res,
             llama_context * lctx) const;
 
-    ggml_cgraph * build_graph_defrag(
-            llm_graph_result * res,
-            llama_context * lctx,
-            const defrag_info & dinfo) const;
-
     struct cell_ranges_t {
         uint32_t strm;
 
@@ -295,35 +276,33 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };
 
-class llama_kv_cache_unified_context : public llama_memory_context_i {
+class llama_kv_cache_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info      = llama_kv_cache_unified::defrag_info;
-    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
+    using slot_info_vec_t  = llama_kv_cache::slot_info_vec_t;
+    using stream_copy_info = llama_kv_cache::stream_copy_info;
 
     // used for errors
-    llama_kv_cache_unified_context(llama_memory_status status);
+    llama_kv_cache_context(llama_memory_status status);
 
     // used to create a full-cache context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv);
+    llama_kv_cache_context(
+            llama_kv_cache * kv);
 
     // used to create an update context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo,
             stream_copy_info sc_info);
 
     // used to create a batch procesing context from a batch
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             slot_info_vec_t sinfos,
             std::vector<llama_ubatch> ubatches);
 
-    virtual ~llama_kv_cache_unified_context();
+    virtual ~llama_kv_cache_context();
 
     //
     // llama_memory_context_i
@@ -336,7 +315,7 @@ public:
     const llama_ubatch & get_ubatch() const override;
 
     //
-    // llama_kv_cache_unified_context specific API
+    // llama_kv_cache_context specific API
     //
 
     uint32_t get_n_kv() const;
@@ -365,7 +344,7 @@ public:
 private:
     llama_memory_status status;
 
-    llama_kv_cache_unified * kv;
+    llama_kv_cache * kv;
     llama_context * lctx;
 
     //
@@ -374,8 +353,6 @@ private:
 
     bool do_shift = false;
 
-    defrag_info dinfo;
-
     stream_copy_info sc_info;
 
     //
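The visible change in this header is the drop of the `_unified` suffix (`llama_kv_cache_unified` → `llama_kv_cache`, `llama_kv_cache_unified_context` → `llama_kv_cache_context`) and the removal of the defragmentation machinery (`defrag_info`, `defrag_prepare`, `build_graph_defrag`). The `layer_filter_cb` the class keeps is a plain `std::function<bool(int32_t)>` predicate. Below is a minimal, self-contained sketch of that filter pattern; the `is_recurrent` table and the loop are illustrative only and not code from this package, but the lambda mirrors the default filter `llama_memory_hybrid` passes further down in this diff.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Sketch: the layer_filter_cb pattern used by llama_kv_cache above, shown with
// standalone types so it compiles outside the llama.cpp tree.
using layer_filter_cb = std::function<bool(int32_t il)>;

int main() {
    // hypothetical per-layer table; in the real code this comes from hparams
    const std::vector<bool> is_recurrent = {false, true, false, false, true};

    // mirrors the default filter llama_memory_hybrid passes to its attention cache:
    // keep only non-recurrent layers in the KV cache
    layer_filter_cb filter_attn = [&](int32_t il) { return !is_recurrent[il]; };

    for (int32_t il = 0; il < (int32_t) is_recurrent.size(); ++il) {
        std::cout << "layer " << il << (filter_attn(il) ? " -> cached\n" : " -> skipped\n");
    }
    return 0;
}
```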
package/src/llama.cpp/src/llama-kv-cells.h:

@@ -11,7 +11,7 @@
 
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
-class llama_kv_cells_unified {
+class llama_kv_cells {
 public:
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {
@@ -77,30 +77,30 @@ public:
     }
 
     // move cell isrc to idst (used during defrag)
-    void mv(uint32_t isrc, uint32_t idst) {
-        assert(isrc < pos.size());
-        assert(idst < pos.size());
+    //void mv(uint32_t isrc, uint32_t idst) {
+    //    assert(isrc < pos.size());
+    //    assert(idst < pos.size());
 
-        assert(pos[idst] == -1);
-        assert(pos[isrc] != -1);
+    //    assert(pos[idst] == -1);
+    //    assert(pos[isrc] != -1);
 
-        pos  [idst] = pos  [isrc];
-        shift[idst] = shift[isrc];
-        seq  [idst] = seq  [isrc];
+    //    pos  [idst] = pos  [isrc];
+    //    shift[idst] = shift[isrc];
+    //    seq  [idst] = seq  [isrc];
 
-        pos  [isrc] = -1;
-        shift[isrc] = 0;
-        seq  [isrc].reset();
+    //    pos  [isrc] = -1;
+    //    shift[isrc] = 0;
+    //    seq  [isrc].reset();
 
-        used.erase (isrc);
-        used.insert(idst);
-    }
+    //    used.erase (isrc);
+    //    used.insert(idst);
+    //}
 
     // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
-    llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
+    llama_kv_cells cp(uint32_t i, uint32_t n) const {
         assert(i + n <= pos.size());
 
-        llama_kv_cells_unified res;
+        llama_kv_cells res;
 
         res.resize(n);
 
@@ -117,8 +117,8 @@ public:
     }
 
     // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
-        llama_kv_cells_unified res;
+    llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells res;
 
         res.resize(idxs.size());
 
@@ -135,7 +135,7 @@ public:
     }
 
     // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
-    void set(uint32_t i, const llama_kv_cells_unified & other) {
+    void set(uint32_t i, const llama_kv_cells & other) {
         assert(i + other.pos.size() <= pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
@@ -165,7 +165,7 @@ public:
     }
 
     // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
         assert(idxs.size() == other.pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
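Alongside the rename to `llama_kv_cells`, the `mv()` helper used only by defrag is commented out, while the `cp()`/`set()` pair used for saving and restoring cell state is kept. Below is a reduced, self-contained sketch of that save/restore pattern; `cells_sketch` and its single `pos` field are stand-ins, not the real class.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: the cp()/set() save-restore pattern of llama_kv_cells, reduced to a
// single pos field so it compiles without the llama.cpp headers.
struct cells_sketch {
    std::vector<int32_t> pos;

    void resize(uint32_t n) { pos.assign(n, -1); }

    // copy the state of cells [i, i + n)
    cells_sketch cp(uint32_t i, uint32_t n) const {
        assert(i + n <= pos.size());
        cells_sketch res;
        res.resize(n);
        for (uint32_t j = 0; j < n; ++j) {
            res.pos[j] = pos[i + j];
        }
        return res;
    }

    // set the state of cells [i, i + other.pos.size())
    void set(uint32_t i, const cells_sketch & other) {
        assert(i + other.pos.size() <= pos.size());
        for (uint32_t j = 0; j < other.pos.size(); ++j) {
            pos[i + j] = other.pos[j];
        }
    }
};

int main() {
    cells_sketch cells;
    cells.resize(8);
    cells.pos[2] = 42;

    cells_sketch backup = cells.cp(0, 8); // save
    cells.pos[2] = -1;                    // mutate
    cells.set(0, backup);                 // restore
    assert(cells.pos[2] == 42);
    return 0;
}
```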
package/src/llama.cpp/src/llama-memory-hybrid.cpp:

@@ -30,7 +30,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
     hparams(model.hparams),
-    mem_attn(new llama_kv_cache_unified(
+    mem_attn(new llama_kv_cache(
         model,
         filter_attn == nullptr ?
             [&](int32_t il) { return !hparams.is_recurrent(il); }
@@ -179,7 +179,7 @@ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id,
     mem_recr->state_read(io, seq_id);
 }
 
-llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
+llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
     return mem_attn.get();
 }
 
@@ -210,7 +210,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
     ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
@@ -248,8 +248,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
     return ubatches[i_next];
 }
 
-const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
+const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
+    return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
 }
 
 const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
package/src/llama.cpp/src/llama-memory-hybrid.h:

@@ -2,7 +2,7 @@
 
 #include "llama-batch.h"
 #include "llama-graph.h"
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 #include "llama-memory.h"
 #include "llama-memory-recurrent.h"
 
@@ -13,7 +13,7 @@
 // llama_memory_hybrid
 //
 
-// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
+// utilizes instances of llama_memory_recurrent and llama_kv_cache to
 // support models where each layer may be either attention-based or recurrent
 
 class llama_memory_hybrid : public llama_memory_i {
@@ -81,19 +81,19 @@ public:
     // llama_memory_hybrid specific API
     //
 
-    llama_kv_cache_unified * get_mem_attn() const;
+    llama_kv_cache * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;
 
 private:
     const llama_hparams & hparams;
 
-    const std::unique_ptr<llama_kv_cache_unified>  mem_attn;
+    const std::unique_ptr<llama_kv_cache>          mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
 };
 
 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
 
     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);
@@ -125,7 +125,7 @@ public:
     // llama_memory_hybrid_context
     //
 
-    const llama_kv_cache_unified_context * get_attn() const;
+    const llama_kv_cache_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;
 
 private:
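In the hybrid memory the only substantive change is the type of the attention cache: it now holds a `const std::unique_ptr<llama_kv_cache>` next to the recurrent memory and hands out raw pointers through `get_mem_attn()`/`get_mem_recr()`. A standalone sketch of that ownership shape follows; the struct names are placeholders, not the real types.

```cpp
#include <memory>

// Sketch: the ownership pattern llama_memory_hybrid keeps after the rename —
// one attention KV cache plus one recurrent memory, each behind a const
// unique_ptr, with raw-pointer accessors.
struct attn_cache  { /* stands in for llama_kv_cache         */ };
struct recr_memory { /* stands in for llama_memory_recurrent */ };

class hybrid_memory_sketch {
public:
    hybrid_memory_sketch()
        : mem_attn(new attn_cache()),
          mem_recr(new recr_memory()) {}

    attn_cache  * get_mem_attn() const { return mem_attn.get(); }
    recr_memory * get_mem_recr() const { return mem_recr.get(); }

private:
    const std::unique_ptr<attn_cache>  mem_attn;
    const std::unique_ptr<recr_memory> mem_recr;
};

int main() {
    hybrid_memory_sketch mem;
    (void) mem.get_mem_attn(); // routed to attention layers
    (void) mem.get_mem_recr(); // routed to recurrent layers
    return 0;
}
```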
package/src/llama.cpp/src/llama-memory-recurrent.h:

@@ -12,7 +12,7 @@
 //
 
 // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
-//       see the implementation of llama_kv_cache_unified_context_i for an example how to do it
+//       see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
 
package/src/llama.cpp/src/llama-memory.h:

@@ -36,8 +36,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);
 
 // the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
-//   - llama_kv_cache_unified_context
-//   - llama_kv_cache_unified_iswa_context
+//   - llama_kv_cache_context
+//   - llama_kv_cache_iswa_context
 //   ...
 //
 // the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@@ -77,7 +77,7 @@ struct llama_memory_i {
     // simulate full cache, used for allocating worst-case compute buffers
     virtual llama_memory_context_ptr init_full() = 0;
 
-    // prepare for any pending memory updates, such as shifts, defrags, etc.
+    // prepare for any pending memory updates, such as shifts, copies, etc.
     // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
     virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
 
@@ -109,8 +109,3 @@ struct llama_memory_i {
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
-
-// TODO: temporary until the llama_kv_cache is removed from the public API
-struct llama_kv_cache : public llama_memory_i {
-    virtual ~llama_kv_cache() = default;
-};
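With the temporary `llama_kv_cache` shim gone from `llama-memory.h`, the `llama_memory_i` contract stands on its own: `init_full()` simulates a full cache for worst-case compute-buffer sizing, `init_update()` reports `LLAMA_MEMORY_STATUS_NO_UPDATE` when nothing is pending, and only `apply()` is supposed to mutate state. Below is a toy sketch of that flow; `memory_sketch`, `pending_shift`, and the status enum are illustrative placeholders, not the real API.

```cpp
#include <memory>

// Toy sketch of the llama_memory_i contract described in llama-memory.h,
// using standalone placeholder types.
enum memory_status_sketch {
    MEMORY_STATUS_SUCCESS,
    MEMORY_STATUS_NO_UPDATE,
};

struct memory_context_sketch {
    memory_status_sketch status = MEMORY_STATUS_SUCCESS;

    // the only call that is supposed to mutate the underlying memory
    bool apply() { return status == MEMORY_STATUS_SUCCESS; }
};

struct memory_sketch {
    bool pending_shift = false;

    // simulate a full cache, e.g. for worst-case compute-buffer sizing
    std::unique_ptr<memory_context_sketch> init_full() const {
        return std::unique_ptr<memory_context_sketch>(new memory_context_sketch());
    }

    // prepare pending updates (shifts, copies, ...); NO_UPDATE when idle
    std::unique_ptr<memory_context_sketch> init_update(bool /*optimize*/) const {
        auto ctx = std::unique_ptr<memory_context_sketch>(new memory_context_sketch());
        ctx->status = pending_shift ? MEMORY_STATUS_SUCCESS : MEMORY_STATUS_NO_UPDATE;
        return ctx;
    }
};

int main() {
    memory_sketch mem;
    auto upd = mem.init_update(/*optimize=*/false);
    if (upd->status != MEMORY_STATUS_NO_UPDATE) {
        upd->apply(); // mutation only happens through apply()
    }
    return 0;
}
```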