@fugood/llama.node 1.4.13 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +23 -2
- package/lib/index.js +2 -1
- package/lib/index.ts +8 -1
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -12
- package/src/LlamaContext.cpp +16 -4
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +183 -60
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +12 -7
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +88 -369
- package/src/llama.cpp/common/download.h +32 -5
- package/src/llama.cpp/common/preset.cpp +87 -2
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +78 -42
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +225 -101
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +63 -27
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -110,7 +110,7 @@ struct llama_file::impl {
|
|
|
110
110
|
}
|
|
111
111
|
}
|
|
112
112
|
|
|
113
|
-
void read_raw(void * ptr, size_t len)
|
|
113
|
+
void read_raw(void * ptr, size_t len) {
|
|
114
114
|
size_t bytes_read = 0;
|
|
115
115
|
while (bytes_read < len) {
|
|
116
116
|
size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
|
|
@@ -127,7 +127,7 @@ struct llama_file::impl {
|
|
|
127
127
|
}
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
-
uint32_t read_u32()
|
|
130
|
+
uint32_t read_u32() {
|
|
131
131
|
uint32_t val;
|
|
132
132
|
read_raw(&val, sizeof(val));
|
|
133
133
|
return val;
|
|
@@ -154,8 +154,8 @@ struct llama_file::impl {
|
|
|
154
154
|
write_raw(&val, sizeof(val));
|
|
155
155
|
}
|
|
156
156
|
|
|
157
|
-
|
|
158
|
-
|
|
157
|
+
bool has_direct_io() const {
|
|
158
|
+
return true;
|
|
159
159
|
}
|
|
160
160
|
|
|
161
161
|
~impl() {
|
|
@@ -164,33 +164,45 @@ struct llama_file::impl {
|
|
|
164
164
|
}
|
|
165
165
|
}
|
|
166
166
|
#else
|
|
167
|
-
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
|
|
167
|
+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
|
|
168
168
|
#ifdef __linux__
|
|
169
169
|
// Try unbuffered I/O for read only
|
|
170
170
|
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
|
|
171
|
-
|
|
171
|
+
if (init_fd()) {
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
|
|
175
|
+
fname, strerror(errno));
|
|
176
|
+
}
|
|
177
|
+
#endif
|
|
178
|
+
init_fp(mode);
|
|
179
|
+
}
|
|
172
180
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
181
|
+
#ifdef __linux__
|
|
182
|
+
bool init_fd() {
|
|
183
|
+
fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
|
|
176
184
|
|
|
177
|
-
|
|
178
|
-
|
|
185
|
+
if (fd != -1) {
|
|
186
|
+
struct stat file_stats{};
|
|
187
|
+
fstat(fd, &file_stats);
|
|
179
188
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
183
|
-
}
|
|
184
|
-
return;
|
|
185
|
-
}
|
|
189
|
+
size = file_stats.st_size;
|
|
190
|
+
alignment = file_stats.st_blksize;
|
|
186
191
|
|
|
187
|
-
|
|
188
|
-
|
|
192
|
+
off_t ret = lseek(fd, 0, SEEK_SET);
|
|
193
|
+
if (ret == -1) {
|
|
194
|
+
throw std::runtime_error(format("seek error: %s", strerror(errno)));
|
|
195
|
+
}
|
|
196
|
+
return true;
|
|
189
197
|
}
|
|
198
|
+
return false;
|
|
199
|
+
}
|
|
190
200
|
#endif
|
|
191
|
-
|
|
201
|
+
|
|
202
|
+
void init_fp(const char * mode) {
|
|
203
|
+
fp = ggml_fopen(fname.c_str(), mode);
|
|
192
204
|
if (fp == NULL) {
|
|
193
|
-
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
205
|
+
throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
|
|
194
206
|
}
|
|
195
207
|
seek(0, SEEK_END);
|
|
196
208
|
size = tell();
|
|
@@ -226,17 +238,20 @@ struct llama_file::impl {
|
|
|
226
238
|
}
|
|
227
239
|
}
|
|
228
240
|
|
|
229
|
-
void
|
|
241
|
+
void read_raw_unsafe(void * ptr, size_t len) {
|
|
230
242
|
if (len == 0) {
|
|
231
243
|
return;
|
|
232
244
|
}
|
|
233
245
|
errno = 0;
|
|
234
246
|
if (fd == -1) {
|
|
235
|
-
|
|
247
|
+
const size_t curr_off = tell();
|
|
248
|
+
const size_t to_read = std::min(len, size - curr_off);
|
|
249
|
+
|
|
250
|
+
std::size_t ret = std::fread(ptr, to_read, 1, fp);
|
|
236
251
|
if (ferror(fp)) {
|
|
237
252
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
238
253
|
}
|
|
239
|
-
if (ret != 1) {
|
|
254
|
+
if (to_read > 0 && ret != 1) {
|
|
240
255
|
throw std::runtime_error("unexpectedly reached end of file");
|
|
241
256
|
}
|
|
242
257
|
} else {
|
|
@@ -249,6 +264,17 @@ struct llama_file::impl {
|
|
|
249
264
|
if (errno == EINTR) {
|
|
250
265
|
continue; // Interrupted by signal, retry
|
|
251
266
|
}
|
|
267
|
+
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
268
|
+
if (errno == EFAULT) {
|
|
269
|
+
auto curr_off = tell();
|
|
270
|
+
close(fd);
|
|
271
|
+
fd = -1;
|
|
272
|
+
alignment = 1;
|
|
273
|
+
init_fp("rb");
|
|
274
|
+
seek(curr_off, SEEK_SET);
|
|
275
|
+
read_raw_unsafe(ptr, len);
|
|
276
|
+
return;
|
|
277
|
+
}
|
|
252
278
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
253
279
|
}
|
|
254
280
|
if (ret == 0) {
|
|
@@ -266,7 +292,8 @@ struct llama_file::impl {
|
|
|
266
292
|
}
|
|
267
293
|
}
|
|
268
294
|
|
|
269
|
-
void read_aligned_chunk(
|
|
295
|
+
void read_aligned_chunk(void * dest, size_t size) {
|
|
296
|
+
size_t offset = tell();
|
|
270
297
|
off_t aligned_offset = offset & ~(alignment - 1);
|
|
271
298
|
off_t offset_from_alignment = offset - aligned_offset;
|
|
272
299
|
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
|
|
@@ -283,13 +310,21 @@ struct llama_file::impl {
|
|
|
283
310
|
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
|
|
284
311
|
|
|
285
312
|
seek(aligned_offset, SEEK_SET);
|
|
286
|
-
|
|
313
|
+
read_raw_unsafe(buffer.get(), bytes_to_read);
|
|
287
314
|
|
|
288
315
|
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
|
|
289
316
|
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
|
|
290
317
|
}
|
|
291
318
|
|
|
292
|
-
|
|
319
|
+
void read_raw(void * ptr, size_t len) {
|
|
320
|
+
if (has_direct_io()) {
|
|
321
|
+
read_aligned_chunk(ptr, len);
|
|
322
|
+
} else {
|
|
323
|
+
read_raw_unsafe(ptr, len);
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
uint32_t read_u32() {
|
|
293
328
|
uint32_t ret;
|
|
294
329
|
read_raw(&ret, sizeof(ret));
|
|
295
330
|
return ret;
|
|
@@ -310,6 +345,10 @@ struct llama_file::impl {
|
|
|
310
345
|
write_raw(&val, sizeof(val));
|
|
311
346
|
}
|
|
312
347
|
|
|
348
|
+
bool has_direct_io() const {
|
|
349
|
+
return fd != -1 && alignment > 1;
|
|
350
|
+
}
|
|
351
|
+
|
|
313
352
|
~impl() {
|
|
314
353
|
if (fd != -1) {
|
|
315
354
|
close(fd);
|
|
@@ -318,17 +357,9 @@ struct llama_file::impl {
|
|
|
318
357
|
}
|
|
319
358
|
}
|
|
320
359
|
int fd = -1;
|
|
360
|
+
std::string fname;
|
|
321
361
|
#endif
|
|
322
362
|
|
|
323
|
-
void read_raw_at(void * ptr, size_t len, size_t offset) const {
|
|
324
|
-
if (alignment != 1) {
|
|
325
|
-
read_aligned_chunk(offset, ptr, len);
|
|
326
|
-
} else {
|
|
327
|
-
seek(offset, SEEK_SET);
|
|
328
|
-
read_raw(ptr, len);
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
|
|
332
363
|
size_t read_alignment() const {
|
|
333
364
|
return alignment;
|
|
334
365
|
}
|
|
@@ -347,6 +378,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
|
|
|
347
378
|
size_t llama_file::size() const { return pimpl->size; }
|
|
348
379
|
|
|
349
380
|
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
|
|
381
|
+
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
|
|
350
382
|
|
|
351
383
|
int llama_file::file_id() const {
|
|
352
384
|
#ifdef _WIN32
|
|
@@ -361,10 +393,14 @@ int llama_file::file_id() const {
|
|
|
361
393
|
}
|
|
362
394
|
|
|
363
395
|
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
|
364
|
-
void llama_file::read_raw(void * ptr, size_t len)
|
|
365
|
-
|
|
396
|
+
void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
397
|
+
#ifdef _WIN32
|
|
398
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
|
|
399
|
+
#else
|
|
400
|
+
void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
|
|
401
|
+
#endif
|
|
366
402
|
|
|
367
|
-
uint32_t llama_file::read_u32()
|
|
403
|
+
uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
|
|
368
404
|
|
|
369
405
|
void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
|
|
370
406
|
void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
|
|
@@ -578,9 +614,9 @@ struct llama_mlock::impl {
|
|
|
578
614
|
|
|
579
615
|
char* errmsg = std::strerror(errno);
|
|
580
616
|
bool suggest = (errno == ENOMEM);
|
|
581
|
-
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
|
|
582
|
-
// visionOS/tvOS
|
|
583
|
-
// Skip resource limit checks on
|
|
617
|
+
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
|
|
618
|
+
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
|
|
619
|
+
// Skip resource limit checks on these platforms
|
|
584
620
|
suggest = false;
|
|
585
621
|
#else
|
|
586
622
|
struct rlimit lock_limit;
|
|
@@ -24,15 +24,16 @@ struct llama_file {
|
|
|
24
24
|
|
|
25
25
|
void seek(size_t offset, int whence) const;
|
|
26
26
|
|
|
27
|
-
void read_raw(void * ptr, size_t len)
|
|
28
|
-
void
|
|
29
|
-
void read_aligned_chunk(
|
|
30
|
-
uint32_t read_u32()
|
|
27
|
+
void read_raw(void * ptr, size_t len);
|
|
28
|
+
void read_raw_unsafe(void * ptr, size_t len);
|
|
29
|
+
void read_aligned_chunk(void * dest, size_t size);
|
|
30
|
+
uint32_t read_u32();
|
|
31
31
|
|
|
32
32
|
void write_raw(const void * ptr, size_t len) const;
|
|
33
33
|
void write_u32(uint32_t val) const;
|
|
34
34
|
|
|
35
35
|
size_t read_alignment() const;
|
|
36
|
+
bool has_direct_io() const;
|
|
36
37
|
private:
|
|
37
38
|
struct impl;
|
|
38
39
|
std::unique_ptr<impl> pimpl;
|
|
@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
|
|
|
495
495
|
const std::string & fname,
|
|
496
496
|
std::vector<std::string> & splits,
|
|
497
497
|
bool use_mmap,
|
|
498
|
+
bool use_direct_io,
|
|
498
499
|
bool check_tensors,
|
|
499
500
|
bool no_alloc,
|
|
500
501
|
const llama_model_kv_override * param_overrides_p,
|
|
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
|
|
|
527
528
|
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
|
528
529
|
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
|
529
530
|
|
|
530
|
-
files.emplace_back(new llama_file(fname.c_str(), "rb",
|
|
531
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
531
532
|
contexts.emplace_back(ctx);
|
|
532
533
|
|
|
534
|
+
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
|
535
|
+
|
|
536
|
+
// Disable mmap in case Direct I/O is enabled and available
|
|
537
|
+
if (use_direct_io && use_mmap) {
|
|
538
|
+
use_mmap = false;
|
|
539
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
540
|
+
}
|
|
541
|
+
|
|
533
542
|
// Save tensors data offset of the main file.
|
|
534
543
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
|
535
544
|
// so we build a unified tensors index for weights.
|
|
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
|
|
|
595
604
|
}
|
|
596
605
|
}
|
|
597
606
|
|
|
598
|
-
files.emplace_back(new llama_file(fname_split, "rb",
|
|
607
|
+
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
|
599
608
|
contexts.emplace_back(ctx);
|
|
600
609
|
|
|
601
610
|
// Save tensors data offset info of the shard.
|
|
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
|
|
|
739
748
|
}
|
|
740
749
|
|
|
741
750
|
this->use_mmap = use_mmap;
|
|
751
|
+
this->use_direct_io = use_direct_io;
|
|
742
752
|
this->check_tensors = check_tensors;
|
|
743
753
|
this->no_alloc = no_alloc;
|
|
744
754
|
}
|
|
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
|
|
|
1100
1110
|
const auto & file = files.at(weight->idx);
|
|
1101
1111
|
|
|
1102
1112
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
1103
|
-
file->
|
|
1113
|
+
file->seek(weight->offs, SEEK_SET);
|
|
1114
|
+
file->read_raw(cur->data, n_size);
|
|
1104
1115
|
if (check_tensors) {
|
|
1105
1116
|
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
|
|
1106
1117
|
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
|
|
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
|
|
|
1132
1143
|
ggml_backend_event_synchronize(events[buffer_idx]);
|
|
1133
1144
|
|
|
1134
1145
|
// Read aligned chunk from file
|
|
1135
|
-
file->
|
|
1146
|
+
file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
|
|
1136
1147
|
|
|
1137
1148
|
// Calculate actual data portion (excluding alignment padding)
|
|
1138
1149
|
uintptr_t ptr_data = ptr_dest_aligned;
|
|
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
|
|
|
1162
1173
|
}
|
|
1163
1174
|
} else {
|
|
1164
1175
|
read_buf.resize(n_size);
|
|
1165
|
-
file->
|
|
1176
|
+
file->seek(weight->offs, SEEK_SET);
|
|
1177
|
+
file->read_raw(read_buf.data(), n_size);
|
|
1166
1178
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
|
1167
1179
|
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
|
|
1168
1180
|
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
|
|
@@ -70,6 +70,7 @@ struct llama_model_loader {
|
|
|
70
70
|
size_t n_bytes = 0;
|
|
71
71
|
|
|
72
72
|
bool use_mmap = false;
|
|
73
|
+
bool use_direct_io = false;
|
|
73
74
|
bool check_tensors;
|
|
74
75
|
bool no_alloc;
|
|
75
76
|
|
|
@@ -97,6 +98,7 @@ struct llama_model_loader {
|
|
|
97
98
|
const std::string & fname,
|
|
98
99
|
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
|
99
100
|
bool use_mmap,
|
|
101
|
+
bool use_direct_io,
|
|
100
102
|
bool check_tensors,
|
|
101
103
|
bool no_alloc,
|
|
102
104
|
const llama_model_kv_override * param_overrides_p,
|