@fugood/llama.node 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +44 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +104 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
- package/src/llama.cpp/include/llama.h +13 -47
- package/src/llama.cpp/src/llama-arch.cpp +298 -3
- package/src/llama.cpp/src/llama-arch.h +22 -1
- package/src/llama.cpp/src/llama-batch.cpp +103 -71
- package/src/llama.cpp/src/llama-batch.h +31 -18
- package/src/llama.cpp/src/llama-chat.cpp +59 -1
- package/src/llama.cpp/src/llama-chat.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +279 -180
- package/src/llama.cpp/src/llama-graph.h +183 -122
- package/src/llama.cpp/src/llama-hparams.cpp +47 -1
- package/src/llama.cpp/src/llama-hparams.h +12 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +3373 -743
- package/src/llama.cpp/src/llama-model.h +20 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +376 -10
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp:

```diff
@@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
                  bool   v_trans,
                  bool   offload,
                  bool   swa_full,
+                 bool   unified,
              uint32_t   kv_size,
              uint32_t   n_seq_max,
              uint32_t   n_ubatch,
-             uint32_t   n_pad) : hparams(model.hparams) {
+             uint32_t   n_pad) : hparams(model.hparams), unified(unified) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
     if (swa_full) {
```
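In the constructor, the new `unified` flag feeds the SWA cache sizing: a unified cache is shared by all sequences, so it must hold one sliding window per sequence (`n_swa*n_seq_max`), while the non-unified layout only needs a single window. A minimal sketch of the arithmetic with hypothetical values; `pad_up` is a local stand-in for `GGML_PAD`, which rounds up to a multiple of its second argument:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Stand-in for GGML_PAD: round x up to the next multiple of n.
static uint32_t pad_up(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    // Hypothetical model/runtime parameters.
    const uint32_t n_swa     = 1000; // sliding-window size
    const uint32_t n_seq_max = 2;    // max parallel sequences
    const uint32_t n_ubatch  = 512;  // micro-batch size
    const uint32_t n_pad     = 256;  // cell alignment
    const uint32_t size_base = 8192; // base cache size (kv_size)

    for (bool unified : {true, false}) {
        const uint32_t size_swa = std::min(
            size_base,
            pad_up(n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
        printf("unified=%d -> size_swa=%u\n", unified ? 1 : 0, size_swa);
    }
    // prints: unified=1 -> size_swa=2560, unified=0 -> size_swa=1536
}
```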
```diff
@@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, n_seq_max, n_pad,
+            v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, n_seq_max, n_pad,
+            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }
 
```
```diff
@@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
     // first try simple split
     do {
+        if (!unified) {
+            // requires equal splits, so we skip the simple split
+            break;
+        }
+
         balloc.split_reset();
 
         std::vector<llama_ubatch> ubatches;
```
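The guard above is the only change to the simple-split attempt: a non-unified cache requires equal splits, so the first strategy is skipped outright. For readers unfamiliar with the surrounding structure, `init_batch` chains its strategies with early-exit `do { ... } while (false)` blocks; a self-contained toy sketch of that pattern (hypothetical helpers, not the real interfaces):

```cpp
#include <cstdio>
#include <optional>
#include <string>

// Toy stand-ins for the two splitting strategies (hypothetical).
static std::optional<std::string> try_simple_split(bool unified) {
    if (!unified) return std::nullopt; // non-unified layout requires equal splits
    return "context(simple-split)";
}

static std::optional<std::string> try_equal_split() {
    return "context(equal-split)";
}

// Mirrors the do { ... break; ... } while (false) fallback structure
// used by init_batch in the diff above.
static std::string init_batch_sketch(bool unified) {
    do {
        auto ctx = try_simple_split(unified);
        if (!ctx) break;   // fall through to the next strategy
        return *ctx;
    } while (false);

    do {
        auto ctx = try_equal_split();
        if (!ctx) break;
        return *ctx;
    } while (false);

    return "error: no suitable split"; // the real code has a TODO for more strategies
}

int main() {
    printf("%s\n", init_batch_sketch(true).c_str());  // context(simple-split)
    printf("%s\n", init_batch_sketch(false).c_str()); // context(equal-split)
}
```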
```diff
@@ -113,20 +119,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-
-
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
             break;
         }
 
-        auto
-        if (
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
             break;
         }
 
-        assert(
+        assert(sinfos_base.size() == sinfos_swa.size());
 
         return std::make_unique<llama_kv_cache_unified_iswa_context>(
-            this, std::move(
+            this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
     // if it fails, try equal split
```
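Two things change in this hunk. First, a new guard checks that the split actually consumed the whole batch: `get_n_used()` is compared against `get_n_tokens()`, and anything short of full coverage abandons the strategy. Second, `prepare()` now yields a vector of slot infos, one per ubatch, for each cache, and the assert requires both caches to produce the same number of them. A toy illustration of the used-token guard (hypothetical stand-in types; the real `llama_batch_allocr` lives in llama-batch.{h,cpp}):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Toy stand-in for the batch allocator's used-token bookkeeping.
struct toy_balloc {
    uint32_t n_tokens; // tokens in the batch
    uint32_t n_used;   // tokens consumed by the split so far
};

int main() {
    toy_balloc balloc = {100, 0};

    // Suppose the chosen strategy can only place 64 of the 100 tokens.
    const uint32_t placeable = 64;
    while (balloc.n_used < placeable) {
        balloc.n_used += std::min(32u, placeable - balloc.n_used); // 32-token ubatches
    }

    // Mirrors the new guard in the diff:
    //     if (balloc.get_n_used() < balloc.get_n_tokens()) { break; }
    if (balloc.n_used < balloc.n_tokens) {
        printf("split failed: placed %u of %u tokens\n", balloc.n_used, balloc.n_tokens);
    }
}
```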
```diff
@@ -135,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
 
         std::vector<llama_ubatch> ubatches;
         while (true) {
-            auto ubatch = balloc.split_equal(n_ubatch);
+            auto ubatch = balloc.split_equal(n_ubatch, !unified);
 
             if (ubatch.n_tokens == 0) {
                 break;
```
```diff
@@ -144,20 +155,25 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-
-
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
             break;
         }
 
-        auto
-        if (
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
             break;
         }
 
-
+        auto sinfos_swa = kv_swa->prepare(ubatches);
+        if (sinfos_swa.empty()) {
+            break;
+        }
+
+        assert(sinfos_base.size() == sinfos_swa.size());
 
         return std::make_unique<llama_kv_cache_unified_iswa_context>(
-            this, std::move(
+            this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
     } while (false);
 
     // TODO: if we fail again, we should attempt different splitting strategies
```
```diff
@@ -220,13 +236,13 @@ llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
 
 llama_kv_cache_unified_iswa_context::llama_kv_cache_unified_iswa_context(
         llama_kv_cache_unified_iswa * kv,
-
-
+        slot_info_vec_t sinfos_base,
+        slot_info_vec_t sinfos_swa,
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(
-    ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(
+    ctx_base(new llama_kv_cache_unified_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+    ctx_swa (new llama_kv_cache_unified_context(kv->get_swa (), std::move(sinfos_swa), this->ubatches)),
     status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
 }
 
```
```diff
@@ -246,7 +262,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
 }
 
 bool llama_kv_cache_unified_iswa_context::apply() {
-    assert(status
+    assert(!llama_memory_status_is_fail(status));
 
     bool res = true;
 
```
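The assert now goes through a failure predicate instead of comparing against a single status value. Judging by the llama-memory.{h,cpp} entries in the file list, a helper along these lines was added in this release; the following is only a plausible sketch of its shape (the enum values mirror llama.cpp's llama-memory.h, and the real definition may differ):

```cpp
// Values mirror llama.cpp's llama-memory.h (assumption: the helper added
// in llama-memory.{h,cpp} looks roughly like this).
enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS,
    LLAMA_MEMORY_STATUS_NO_UPDATE,
    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};

// A predicate keeps call sites correct if more failure states are added
// later, instead of each one comparing against a single status value.
bool llama_memory_status_is_fail(llama_memory_status status) {
    switch (status) {
        case LLAMA_MEMORY_STATUS_SUCCESS:
        case LLAMA_MEMORY_STATUS_NO_UPDATE:
            return false;
        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
            return true;
    }
    return false;
}
```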
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h:

```diff
@@ -20,6 +20,7 @@ public:
                  bool   v_trans,
                  bool   offload,
                  bool   swa_full,
+                 bool   unified,
              uint32_t   kv_size,
              uint32_t   n_seq_max,
              uint32_t   n_ubatch,
```
```diff
@@ -68,12 +69,16 @@ public:
 private:
     const llama_hparams & hparams;
 
+    const bool unified;
+
     std::unique_ptr<llama_kv_cache_unified> kv_base;
     std::unique_ptr<llama_kv_cache_unified> kv_swa;
 };
 
 class llama_kv_cache_unified_iswa_context : public llama_memory_context_i {
 public:
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+
     // used for errors
     llama_kv_cache_unified_iswa_context(llama_memory_status status);
 
```
```diff
@@ -90,8 +95,8 @@ public:
     // used to create a batch processing context from a batch
     llama_kv_cache_unified_iswa_context(
             llama_kv_cache_unified_iswa * kv,
-
-
+            slot_info_vec_t sinfos_base,
+            slot_info_vec_t sinfos_swa,
             std::vector<llama_ubatch> ubatches);
 
     virtual ~llama_kv_cache_unified_iswa_context();
```
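Tying the header changes together: the iSWA context now carries one slot info per ubatch for each of the two caches, and its constructor expects the two vectors to stay in lockstep with the ubatch list. A self-contained sketch of that data flow with stand-in types (the real `slot_info` and `slot_info_vec_t` are defined in llama-kv-cache-unified.h, per the file list above):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the types aliased by slot_info_vec_t.
struct slot_info { std::vector<uint32_t> idxs; }; // cells chosen for one ubatch
using slot_info_vec_t = std::vector<slot_info>;

struct ubatch { uint32_t n_tokens; };

int main() {
    std::vector<ubatch> ubatches = {{32}, {32}};

    // Each cache's prepare() yields one slot_info per ubatch...
    slot_info_vec_t sinfos_base = {{{0, 1}}, {{2, 3}}};
    slot_info_vec_t sinfos_swa  = {{{0}}, {{1}}};

    // ...and the iSWA context requires the two caches to agree,
    // mirroring the assert added in init_batch.
    assert(sinfos_base.size() == sinfos_swa.size());
    assert(sinfos_base.size() == ubatches.size());
    return 0;
}
```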