@fugood/llama.node 1.1.6 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/LlamaContext.cpp +9 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +132 -41
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +311 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +46 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -22
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +81 -70
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp
@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
 
-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-    kv_base->state_write(io, seq_id);
-    kv_swa ->state_write(io, seq_id);
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
 }
 
-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-    kv_base->state_read(io, seq_id);
-    kv_swa ->state_read(io, seq_id);
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
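Note: the new llama_state_seq_flags argument lets callers restrict which half of the iSWA cache pair is serialized. With the default flags value of 0 both the base and the sliding-window (SWA) caches are written and read; when LLAMA_STATE_SEQ_FLAGS_SWA_ONLY is set, the base cache is skipped. The sketch below only models that gating with hypothetical stand-in types (FakeCache, write_state); it is not the real llama.cpp API.

#include <cstdint>
#include <cstdio>

// Stand-ins for illustration only -- not the real llama.cpp types.
using state_seq_flags = uint32_t;
constexpr state_seq_flags STATE_SEQ_FLAGS_SWA_ONLY = 1u;

struct FakeCache {
    const char * name;
    void state_write(int seq_id, state_seq_flags flags) const {
        std::printf("writing %s (seq %d, flags %u)\n", name, seq_id, flags);
    }
};

// Mirrors the gating added in llama_kv_cache_unified_iswa::state_write:
// the base cache is skipped when only the SWA portion is requested.
void write_state(const FakeCache & base, const FakeCache & swa,
                 int seq_id, state_seq_flags flags = 0) {
    if ((flags & STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
        base.state_write(seq_id, flags);
    }
    swa.state_write(seq_id, flags);
}

int main() {
    FakeCache base{"base"}, swa{"swa"};
    write_state(base, swa, 0);                            // writes both caches
    write_state(base, swa, 0, STATE_SEQ_FLAGS_SWA_ONLY);  // writes only the SWA cache
}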
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h
@@ -56,8 +56,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified_iswa specific API
package/src/llama.cpp/src/llama-kv-cache-unified.cpp
@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head  = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
         p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head  = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
                 }
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head  = v_heads[s];
 
-            cells.rm(i);
+            uint32_t new_head = cells.size();
 
-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
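Note: seq_rm now accepts seq_id == -1 explicitly, moves the cells/head/new_head lookups into each branch, and makes the "match any sequence" path walk every stream instead of indexing seq_to_stream with an invalid id. A minimal stand-alone sketch of that per-stream removal pattern, using a hypothetical Cells stand-in rather than the real cell container:

#include <cstdint>
#include <vector>

// Hypothetical stand-in for one stream's cell metadata.
struct Cells {
    std::vector<int32_t> pos;   // position of each cell, -1 if empty
    bool pos_in(size_t i, int32_t p0, int32_t p1) const {
        return pos[i] >= p0 && pos[i] < p1;
    }
    void rm(size_t i) { pos[i] = -1; }
};

// Mirrors the new "match any sequence" branch: when seq_id == -1, purge the
// position range [p0, p1) from every stream and pull each head back to the
// first freed slot so the next slot search can start there.
void rm_range_all_streams(std::vector<Cells> & v_cells,
                          std::vector<uint32_t> & v_heads,
                          int32_t p0, int32_t p1) {
    for (size_t s = 0; s < v_cells.size(); ++s) {
        auto & cells = v_cells[s];
        auto & head  = v_heads[s];

        uint32_t new_head = (uint32_t) cells.pos.size();

        for (uint32_t i = 0; i < cells.pos.size(); ++i) {
            if (cells.pos[i] < 0 || !cells.pos_in(i, p0, p1)) {
                continue;
            }
            cells.rm(i);
            if (new_head == cells.pos.size()) {
                new_head = i;
            }
        }

        if (new_head != cells.pos.size() && new_head < head) {
            head = new_head;
        }
    }
}

int main() {
    std::vector<Cells>    v_cells = {{{0, 1, 2, 3}}, {{5, 6, -1, 7}}};
    std::vector<uint32_t> v_heads = {4, 4};
    rm_range_all_streams(v_cells, v_heads, 1, 6);  // drops positions 1..5 in both streams
}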
@@ -738,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }
 
 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
 
-        const uint32_t head_cur = v_heads[seq_to_stream[1]];
-
-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
 
-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                     } else {
-                        ss += 'M';
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                     }
                 }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
             }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                 }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
             }
-
-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
         }
     }
 
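Note: the find_slot debug dump is now emitted per stream: for each unique sequence in the ubatch it logs that stream's usage counters and, at higher debug levels, an occupancy map where '.' marks an empty cell, a digit the owning sequence id, and 'M' a cell shared by several sequences, with a " *" marker every 256 cells. A rough stand-alone rendering of that map format, with a hypothetical cell container in place of the real one:

#include <cstdio>
#include <string>
#include <vector>

// Each cell lists the sequence ids that reference it; an empty vector is a free
// cell. This is a hypothetical stand-in for llama.cpp's internal cell container.
std::string render_cell_map(const std::vector<std::vector<int>> & cells) {
    std::string ss;
    for (size_t i = 0; i < cells.size(); ++i) {
        if (cells[i].empty()) {
            ss += '.';                               // empty cell
        } else if (cells[i].size() == 1) {
            ss += std::to_string(cells[i][0]);       // owned by a single sequence
        } else {
            ss += 'M';                               // shared by multiple sequences
        }
        if (i % 256 == 255) {
            ss += " *";                              // row marker every 256 cells
            ss += '\n';
        }
    }
    return ss;
}

int main() {
    std::vector<std::vector<int>> cells(300);
    for (size_t i = 0;  i < 40; ++i) cells[i] = {0};
    for (size_t i = 40; i < 48; ++i) cells[i] = {0, 1};  // cells shared by seq 0 and 1
    std::printf("%s\n", render_cell_map(cells).c_str());
}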
@@ -1812,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1863,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     uint32_t n_stream_cur;
package/src/llama.cpp/src/llama-kv-cache-unified.h
@@ -136,8 +136,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified specific API
package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }
 
-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -74,8 +74,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_memory_hybrid specific API
package/src/llama.cpp/src/llama-memory-recurrent.cpp
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }
 
-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     state_write_data(io, cell_ranges);
 }
 
-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));
 
package/src/llama.cpp/src/llama-memory-recurrent.h
@@ -63,8 +63,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
package/src/llama.cpp/src/llama-memory.h
@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //
 
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;