@fugood/llama.node 1.1.6 → 1.1.7
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +124 -40
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +312 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +41 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -17
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +1 -0
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp

@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
 
-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-    kv_base->state_write(io, seq_id);
-    kv_swa->state_write(io, seq_id);
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
 }
 
-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-    kv_base->state_read(io, seq_id);
-    kv_swa->state_read(io, seq_id);
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
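Note: the iSWA cache pairs a full-attention base cache with a small sliding-window (SWA) cache, and the new llama_state_seq_flags argument lets callers skip the base cache entirely: with LLAMA_STATE_SEQ_FLAGS_SWA_ONLY set, only the sliding-window cells are serialized, so per-sequence checkpoints stay small. A minimal caller-side sketch, assuming the llama_state_seq_get_size_ext / llama_state_seq_get_data_ext entry points that the llama.h changes in this release (+25 lines) appear to pair with these flags:

    // Sketch: snapshot only the SWA portion of one sequence's KV state.
    // Assumes the *_ext state functions declared in the updated llama.h.
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    static std::vector<uint8_t> save_swa_checkpoint(llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

        // size of the serialized state for this sequence, SWA cells only
        const size_t n = llama_state_seq_get_size_ext(ctx, seq_id, flags);

        std::vector<uint8_t> buf(n);
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
        return buf;
    }

Restoring would go through the matching llama_state_seq_set_data_ext call with the same flag.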
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h

@@ -56,8 +56,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified_iswa specific API
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
         p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
                 }
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head = v_heads[s];
 
-            cells.rm(i);
+            uint32_t new_head = cells.size();
 
-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
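The seq_rm rework above relaxes the entry assertion so that seq_id == -1 is accepted as "match any sequence", and that branch now walks every stream with its own cells/head/new_head instead of resolving a single stream up front. Through the public API this is just a removal call with -1 as the sequence id; a minimal sketch, assuming the llama_get_memory / llama_memory_seq_rm entry points of current llama.h:

    // Sketch: drop cached tokens at positions >= n_keep in every sequence.
    // seq_id == -1 matches any sequence; p1 == -1 means "up to the end".
    #include "llama.h"

    static void trim_all_sequences(llama_context * ctx, llama_pos n_keep) {
        llama_memory_t mem = llama_get_memory(ctx);

        llama_memory_seq_rm(mem, /*seq_id =*/ -1, /*p0 =*/ n_keep, /*p1 =*/ -1);
    }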
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -738,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }
 
 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
 
-        const uint32_t head_cur = v_heads[seq_to_stream[1]];
-
-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
 
-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                     } else {
-                        ss += 'M';
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                     }
                 }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
             }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                 }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
             }
-
-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
         }
     }
 
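The reworked dump stays gated on the cache's debug level (upstream llama.cpp initializes it from the LLAMA_KV_CACHE_DEBUG environment variable); the difference is that it now prints one block per unique sequence in the ubatch, labeled with the stream id that seq_to_stream resolves to, instead of reading a single hard-coded stream.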
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -1812,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1863,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     uint32_t n_stream_cur;
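In both functions the new flags parameter exists only to satisfy the common memory interface, so it is explicitly marked unused. GGML_UNUSED is ggml's usual cast-to-void idiom for silencing unused-parameter warnings, along the lines of:

    // as defined in ggml.h
    #define GGML_UNUSED(x) (void)(x)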
package/src/llama.cpp/src/llama-kv-cache-unified.h

@@ -136,8 +136,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified specific API
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }
 
-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -74,8 +74,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_memory_hybrid specific API
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }
 
-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }
 
-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));
 
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -63,8 +63,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
package/src/llama.cpp/src/llama-memory.h

@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //
 
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
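Every implementation above repeats the seq_id = -1, flags = 0 defaults declared here on the pure virtuals. That is deliberate: in C++, default arguments are bound statically to the declaration visible at the call site, not to the object's dynamic type, so a call through a llama_memory_i pointer always picks up these defaults no matter which override runs. A small self-contained illustration (names hypothetical, not from the diff):

    #include <cstdio>

    struct base_i {
        virtual ~base_i() = default;
        virtual void f(int flags = 1) const { std::printf("base: %d\n", flags); }
    };

    struct impl : base_i {
        // deliberately different default, for demonstration only
        void f(int flags = 2) const override { std::printf("impl: %d\n", flags); }
    };

    int main() {
        impl d;
        const base_i * b = &d;
        b->f(); // prints "impl: 1": impl's body runs, but the default value
                // comes from the static type base_i, not from impl
        return 0;
    }

Keeping the defaults identical across the interface and its implementations, as this diff does, sidesteps that mismatch.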
package/src/llama.cpp/src/llama-model.cpp

@@ -1095,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_537M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
package/src/llama.cpp/src/llama-quant.cpp

@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
 
            // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 1
+#if 0
            if (new_type == GGML_TYPE_MXFP4) {
                auto * x = f32_data_03;
 
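The flip to #if 0 compiles out the temporary sanity check that verified the F16 -> MXFP4 round trip is lossless, while leaving the code in the tree for future debugging.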
package/src/llama.cpp/src/llama-vocab.cpp

@@ -2341,7 +2341,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }
@@ -2388,6 +2388,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
        if (has_return && has_call && has_end) {
            special_eog_ids.erase(end_id);
+           id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
        }
    }