@fugood/llama.node 1.4.13 → 1.4.15

Files changed (44)
  1. package/lib/binding.ts +23 -2
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +8 -1
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -12
  7. package/src/LlamaContext.cpp +16 -4
  8. package/src/llama.cpp/CMakeLists.txt +24 -8
  9. package/src/llama.cpp/common/CMakeLists.txt +3 -34
  10. package/src/llama.cpp/common/arg.cpp +183 -60
  11. package/src/llama.cpp/common/arg.h +0 -8
  12. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  13. package/src/llama.cpp/common/chat.cpp +67 -0
  14. package/src/llama.cpp/common/chat.h +1 -0
  15. package/src/llama.cpp/common/common.cpp +2 -1
  16. package/src/llama.cpp/common/common.h +12 -7
  17. package/src/llama.cpp/common/debug.cpp +165 -0
  18. package/src/llama.cpp/common/debug.h +43 -0
  19. package/src/llama.cpp/common/download.cpp +88 -369
  20. package/src/llama.cpp/common/download.h +32 -5
  21. package/src/llama.cpp/common/preset.cpp +87 -2
  22. package/src/llama.cpp/common/preset.h +10 -1
  23. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  24. package/src/llama.cpp/include/llama.h +5 -2
  25. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  27. package/src/llama.cpp/src/llama-arch.h +1 -0
  28. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  29. package/src/llama.cpp/src/llama-chat.h +1 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  31. package/src/llama.cpp/src/llama-mmap.cpp +78 -42
  32. package/src/llama.cpp/src/llama-mmap.h +5 -4
  33. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  34. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  35. package/src/llama.cpp/src/llama-model.cpp +225 -101
  36. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  37. package/src/llama.cpp/src/llama-sampling.cpp +1 -1
  38. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  39. package/src/llama.cpp/src/llama-vocab.h +1 -0
  40. package/src/llama.cpp/src/llama.cpp +63 -27
  41. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  42. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  43. package/src/llama.cpp/src/models/models.h +13 -2
  44. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
package/src/llama.cpp/src/llama-mmap.cpp

@@ -110,7 +110,7 @@ struct llama_file::impl {
         }
     }
 
-    void read_raw(void * ptr, size_t len) const {
+    void read_raw(void * ptr, size_t len) {
         size_t bytes_read = 0;
         while (bytes_read < len) {
             size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +127,7 @@ struct llama_file::impl {
         }
     }
 
-    uint32_t read_u32() const {
+    uint32_t read_u32() {
         uint32_t val;
         read_raw(&val, sizeof(val));
         return val;
@@ -154,8 +154,8 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
-        throw std::runtime_error("DirectIO is not implemented on Windows.");
+    bool has_direct_io() const {
+        return true;
     }
 
     ~impl() {
@@ -164,33 +164,45 @@ struct llama_file::impl {
         }
     }
 #else
-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
 #ifdef __linux__
         // Try unbuffered I/O for read only
         if (use_direct_io && std::strcmp(mode, "rb") == 0) {
-            fd = open(fname, O_RDONLY | O_DIRECT);
+            if (init_fd()) {
+                return;
+            }
+            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+                fname, strerror(errno));
+        }
+#endif
+        init_fp(mode);
+    }
 
-        if (fd != -1) {
-            struct stat file_stats{};
-            fstat(fd, &file_stats);
+#ifdef __linux__
+    bool init_fd() {
+        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
 
-            size = file_stats.st_size;
-            alignment = file_stats.st_blksize;
+        if (fd != -1) {
+            struct stat file_stats{};
+            fstat(fd, &file_stats);
 
-            off_t ret = lseek(fd, 0, SEEK_SET);
-            if (ret == -1) {
-                throw std::runtime_error(format("seek error: %s", strerror(errno)));
-            }
-            return;
-        }
+            size = file_stats.st_size;
+            alignment = file_stats.st_blksize;
 
-        LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
-                fname, strerror(errno));
+            off_t ret = lseek(fd, 0, SEEK_SET);
+            if (ret == -1) {
+                throw std::runtime_error(format("seek error: %s", strerror(errno)));
+            }
+            return true;
         }
+        return false;
+    }
 #endif
-        fp = ggml_fopen(fname, mode);
+
+    void init_fp(const char * mode) {
+        fp = ggml_fopen(fname.c_str(), mode);
         if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
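
The constructor refactor above splits opening into init_fd() (the Linux O_DIRECT path) and init_fp() (the buffered stdio fallback). For orientation, a minimal standalone probe of what init_fd() relies on; it assumes Linux and g++ (which predefines _GNU_SOURCE, exposing O_DIRECT) and is not code from the package:

```cpp
// Hypothetical probe: open a file the way init_fd() does and report the
// fields the loader caches (file size, and the block size later used as
// the read alignment for direct I/O).
#include <fcntl.h>     // open, O_RDONLY, O_DIRECT
#include <sys/stat.h>  // fstat, struct stat
#include <unistd.h>    // close
#include <cstdio>      // fprintf, printf, perror

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd == -1) {
        // this is the case where the real code logs a warning and
        // falls back to init_fp()/ggml_fopen()
        perror("open(O_DIRECT)");
        return 1;
    }
    struct stat file_stats{};
    fstat(fd, &file_stats);
    std::printf("size=%lld alignment=%lld\n",
                (long long) file_stats.st_size,
                (long long) file_stats.st_blksize);
    close(fd);
    return 0;
}
```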
@@ -226,17 +238,20 @@ struct llama_file::impl {
         }
     }
 
-    void read_raw(void * ptr, size_t len) const {
+    void read_raw_unsafe(void * ptr, size_t len) {
         if (len == 0) {
             return;
         }
         errno = 0;
         if (fd == -1) {
-            std::size_t ret = std::fread(ptr, len, 1, fp);
+            const size_t curr_off = tell();
+            const size_t to_read = std::min(len, size - curr_off);
+
+            std::size_t ret = std::fread(ptr, to_read, 1, fp);
             if (ferror(fp)) {
                 throw std::runtime_error(format("read error: %s", strerror(errno)));
             }
-            if (ret != 1) {
+            if (to_read > 0 && ret != 1) {
                 throw std::runtime_error("unexpectedly reached end of file");
             }
         } else {
@@ -249,6 +264,17 @@
                 if (errno == EINTR) {
                     continue; // Interrupted by signal, retry
                 }
+                // Fallback to std::fread in case the DMA controller cannot access the buffer
+                if (errno == EFAULT) {
+                    auto curr_off = tell();
+                    close(fd);
+                    fd = -1;
+                    alignment = 1;
+                    init_fp("rb");
+                    seek(curr_off, SEEK_SET);
+                    read_raw_unsafe(ptr, len);
+                    return;
+                }
                 throw std::runtime_error(format("read error: %s", strerror(errno)));
             }
             if (ret == 0) {
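
The new EFAULT branch above covers destination buffers the kernel cannot DMA into under O_DIRECT: the file is reopened buffered (fd = -1, alignment = 1) and the read is replayed through std::fread at the saved offset. As a generic illustration of the same retry/fallback shape, a sketch over a plain POSIX fd (hypothetical helper, not the package's code):

```cpp
#include <cerrno>     // errno, EINTR, EFAULT
#include <cstring>    // strerror
#include <stdexcept>
#include <string>
#include <unistd.h>   // read, ssize_t

// Returns bytes actually read; a short count with errno == EFAULT signals
// that the caller should reopen the file buffered and retry, mirroring the
// fallback in read_raw_unsafe() above.
static size_t read_with_retry(int fd, void * dst, size_t len) {
    size_t done = 0;
    while (done < len) {
        ssize_t ret = read(fd, (char *) dst + done, len - done);
        if (ret == -1) {
            if (errno == EINTR)  continue;    // interrupted by a signal: retry
            if (errno == EFAULT) return done; // buffer not DMA-addressable
            throw std::runtime_error(std::string("read error: ") + strerror(errno));
        }
        if (ret == 0) break;                  // end of file
        done += (size_t) ret;
    }
    return done;
}
```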
@@ -266,7 +292,8 @@
         }
     }
 
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+    void read_aligned_chunk(void * dest, size_t size) {
+        size_t offset = tell();
         off_t aligned_offset = offset & ~(alignment - 1);
         off_t offset_from_alignment = offset - aligned_offset;
         size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -283,13 +310,21 @@
         std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
 
         seek(aligned_offset, SEEK_SET);
-        read_raw(buffer.get(), bytes_to_read);
+        read_raw_unsafe(buffer.get(), bytes_to_read);
 
         uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
         memcpy(dest, reinterpret_cast<void *>(actual_data), size);
     }
 
-    uint32_t read_u32() const {
+    void read_raw(void * ptr, size_t len) {
+        if (has_direct_io()) {
+            read_aligned_chunk(ptr, len);
+        } else {
+            read_raw_unsafe(ptr, len);
+        }
+    }
+
+    uint32_t read_u32() {
         uint32_t ret;
         read_raw(&ret, sizeof(ret));
         return ret;
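
read_raw() is now a dispatcher: aligned reads via read_aligned_chunk() when direct I/O is active, plain reads otherwise. The alignment arithmetic in read_aligned_chunk() rounds the start of the request down and its end up to the device block size; a self-contained worked example with illustrative numbers:

```cpp
#include <cstddef>  // size_t
#include <cstdio>   // printf

int main() {
    const size_t alignment = 4096;  // e.g. st_blksize; must be a power of two
    const size_t offset    = 10000; // current file position, as from tell()
    const size_t size      = 5000;  // bytes the caller actually wants

    const size_t aligned_offset        = offset & ~(alignment - 1);  // 8192
    const size_t offset_from_alignment = offset - aligned_offset;    // 1808
    // 1808 + 5000 = 6808 bytes needed, rounded up to the next block: 8192
    const size_t bytes_to_read =
        (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);

    std::printf("aligned_offset=%zu skip=%zu bytes_to_read=%zu\n",
                aligned_offset, offset_from_alignment, bytes_to_read);
    return 0;
}
```

The chunk is read into an aligned scratch buffer, then the requested `size` bytes are memcpy'd out starting `offset_from_alignment` bytes in.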
@@ -310,6 +345,10 @@
         write_raw(&val, sizeof(val));
     }
 
+    bool has_direct_io() const {
+        return fd != -1 && alignment > 1;
+    }
+
     ~impl() {
         if (fd != -1) {
             close(fd);
@@ -318,17 +357,9 @@
         }
     }
     int fd = -1;
+    std::string fname;
 #endif
 
-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (alignment != 1) {
-            read_aligned_chunk(offset, ptr, len);
-        } else {
-            seek(offset, SEEK_SET);
-            read_raw(ptr, len);
-        }
-    }
-
     size_t read_alignment() const {
         return alignment;
     }
@@ -347,6 +378,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
 
 size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
 
 int llama_file::file_id() const {
 #ifdef _WIN32
@@ -361,10 +393,14 @@ int llama_file::file_id() const {
 }
 
 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
-void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+#endif
 
-uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
 
 void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
@@ -578,9 +614,9 @@ struct llama_mlock::impl {
 
         char* errmsg = std::strerror(errno);
         bool suggest = (errno == ENOMEM);
-#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
-        // visionOS/tvOS dont't support RLIMIT_MEMLOCK
-        // Skip resource limit checks on visionOS/tvOS
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
+        // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on these platforms
         suggest = false;
 #else
         struct rlimit lock_limit;
package/src/llama.cpp/src/llama-mmap.h

@@ -24,15 +24,16 @@ struct llama_file {
 
     void seek(size_t offset, int whence) const;
 
-    void read_raw(void * ptr, size_t len) const;
-    void read_raw_at(void * ptr, size_t len, size_t offset) const;
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
-    uint32_t read_u32() const;
+    void read_raw(void * ptr, size_t len);
+    void read_raw_unsafe(void * ptr, size_t len);
+    void read_aligned_chunk(void * dest, size_t size);
+    uint32_t read_u32();
 
     void write_raw(const void * ptr, size_t len) const;
     void write_u32(uint32_t val) const;
 
     size_t read_alignment() const;
+    bool has_direct_io() const;
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
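
Net effect on the header: read_raw() and read_u32() lose their const (a direct-I/O read may mutate internal state on fallback), read_raw_at() is gone in favor of explicit seek() plus read_raw(), and has_direct_io() is exposed. A hypothetical usage sketch against these declarations (internal header; the helper and call shape are invented for illustration, and the three-argument constructor is assumed from the loader changes below):

```cpp
#include "llama-mmap.h"
#include <cstdio>   // SEEK_SET
#include <vector>

static std::vector<char> read_blob(const char * path, size_t offs, size_t n) {
    // third argument requests direct I/O, as the model loader now does;
    // if the O_DIRECT open fails on Linux, the object falls back to
    // buffered stdio and has_direct_io() reports false
    llama_file file(path, "rb", /*use_direct_io =*/ true);

    std::vector<char> buf(n);
    file.seek(offs, SEEK_SET);
    file.read_raw(buf.data(), n); // routes through read_aligned_chunk() when direct I/O is active
    return buf;
}
```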
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
         const std::string & fname,
         std::vector<std::string> & splits,
         bool use_mmap,
+        bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
+    use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+    // Disable mmap in case Direct I/O is enabled and available
+    if (use_direct_io && use_mmap) {
+        use_mmap = false;
+        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+    }
+
     // Save tensors data offset of the main file.
     // For subsidiary files, `meta` tensor data offset must not be used,
     // so we build a unified tensors index for weights.
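
Two things happen in the hunk above: the request is demoted if the first file could not actually be opened with O_DIRECT (has_direct_io() is false after the fallback), and direct I/O then takes precedence over mmap, since the two load paths are mutually exclusive. A condensed sketch of that resolution order (hypothetical free function, not code from the package):

```cpp
#include <cstdio>

static void resolve_io_flags(bool & use_mmap, bool & use_direct_io, bool file_has_direct_io) {
    // direct I/O only counts if the open with O_DIRECT actually succeeded
    use_direct_io = use_direct_io && file_has_direct_io;

    // when both are requested, direct I/O wins and mmap is disabled
    if (use_direct_io && use_mmap) {
        use_mmap = false;
        std::fprintf(stderr, "direct I/O is enabled, disabling mmap\n");
    }
}
```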
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
             }
         }
 
-        files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+        files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the shard.
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
     }
 
     this->use_mmap = use_mmap;
+    this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
 }
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
             const auto & file = files.at(weight->idx);
 
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->read_raw_at(cur->data, n_size, weight->offs);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(cur->data, n_size);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
                     ggml_backend_event_synchronize(events[buffer_idx]);
 
                     // Read aligned chunk from file
-                    file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+                    file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
 
                     // Calculate actual data portion (excluding alignment padding)
                     uintptr_t ptr_data = ptr_dest_aligned;
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
                 }
             } else {
                 read_buf.resize(n_size);
-                file->read_raw_at(read_buf.data(), n_size, weight->offs);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(read_buf.data(), n_size);
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                 if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                     throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
package/src/llama.cpp/src/llama-model-loader.h

@@ -70,6 +70,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
 
@@ -97,6 +98,7 @@ struct llama_model_loader {
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
+        bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,