@fugood/llama.node 1.4.12 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +11 -1
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +2 -0
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -9
  7. package/src/LlamaContext.cpp +5 -2
  8. package/src/llama.cpp/common/arg.cpp +249 -101
  9. package/src/llama.cpp/common/arg.h +0 -8
  10. package/src/llama.cpp/common/chat.cpp +4 -4
  11. package/src/llama.cpp/common/common.cpp +21 -1
  12. package/src/llama.cpp/common/common.h +20 -7
  13. package/src/llama.cpp/common/download.cpp +104 -55
  14. package/src/llama.cpp/common/download.h +26 -5
  15. package/src/llama.cpp/common/llguidance.cpp +10 -6
  16. package/src/llama.cpp/common/preset.cpp +76 -1
  17. package/src/llama.cpp/common/preset.h +10 -1
  18. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  19. package/src/llama.cpp/common/sampling.cpp +58 -14
  20. package/src/llama.cpp/common/sampling.h +3 -1
  21. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  22. package/src/llama.cpp/include/llama.h +92 -10
  23. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  24. package/src/llama.cpp/src/llama-arch.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +615 -28
  26. package/src/llama.cpp/src/llama-context.h +43 -1
  27. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  28. package/src/llama.cpp/src/llama-grammar.h +2 -0
  29. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  30. package/src/llama.cpp/src/llama-graph.h +71 -6
  31. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  32. package/src/llama.cpp/src/llama-hparams.h +8 -2
  33. package/src/llama.cpp/src/llama-mmap.cpp +70 -37
  34. package/src/llama.cpp/src/llama-mmap.h +5 -4
  35. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  36. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  37. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  38. package/src/llama.cpp/src/llama-model.cpp +66 -16
  39. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  40. package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
  41. package/src/llama.cpp/src/llama-sampling.h +16 -7
  42. package/src/llama.cpp/src/llama.cpp +101 -57
  43. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  44. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  45. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  46. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  47. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  48. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  49. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
@@ -110,7 +110,7 @@ struct llama_file::impl {
  }
  }

- void read_raw(void * ptr, size_t len) const {
+ void read_raw(void * ptr, size_t len) {
  size_t bytes_read = 0;
  while (bytes_read < len) {
  size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +127,7 @@ struct llama_file::impl {
  }
  }

- uint32_t read_u32() const {
+ uint32_t read_u32() {
  uint32_t val;
  read_raw(&val, sizeof(val));
  return val;
@@ -154,8 +154,8 @@ struct llama_file::impl {
  write_raw(&val, sizeof(val));
  }

- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
- throw std::runtime_error("DirectIO is not implemented on Windows.");
+ bool has_direct_io() const {
+ return true;
  }

  ~impl() {
@@ -164,33 +164,45 @@ struct llama_file::impl {
  }
  }
  #else
- impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
  #ifdef __linux__
  // Try unbuffered I/O for read only
  if (use_direct_io && std::strcmp(mode, "rb") == 0) {
- fd = open(fname, O_RDONLY | O_DIRECT);
+ if (init_fd()) {
+ return;
+ }
+ LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+ #endif
+ init_fp(mode);
+ }

- if (fd != -1) {
- struct stat file_stats{};
- fstat(fd, &file_stats);
+ #ifdef __linux__
+ bool init_fd() {
+ fd = open(fname.c_str(), O_RDONLY | O_DIRECT);

- size = file_stats.st_size;
- alignment = file_stats.st_blksize;
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);

- off_t ret = lseek(fd, 0, SEEK_SET);
- if (ret == -1) {
- throw std::runtime_error(format("seek error: %s", strerror(errno)));
- }
- return;
- }
+ size = file_stats.st_size;
+ alignment = file_stats.st_blksize;

- LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
- fname, strerror(errno));
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return true;
  }
+ return false;
+ }
  #endif
- fp = ggml_fopen(fname, mode);
+
+ void init_fp(const char * mode) {
+ fp = ggml_fopen(fname.c_str(), mode);
  if (fp == NULL) {
- throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
  }
  seek(0, SEEK_END);
  size = tell();
@@ -226,7 +238,7 @@ struct llama_file::impl {
  }
  }

- void read_raw(void * ptr, size_t len) const {
+ void read_raw_unsafe(void * ptr, size_t len) {
  if (len == 0) {
  return;
  }
@@ -249,6 +261,17 @@ struct llama_file::impl {
  if (errno == EINTR) {
  continue; // Interrupted by signal, retry
  }
+ // Fallback to std::fread in case the DMA controller cannot access the buffer
+ if (errno == EFAULT) {
+ auto curr_off = tell();
+ close(fd);
+ fd = -1;
+ alignment = 1;
+ init_fp("rb");
+ seek(curr_off, SEEK_SET);
+ read_raw_unsafe(ptr, len);
+ return;
+ }
  throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret == 0) {
@@ -266,7 +289,8 @@ struct llama_file::impl {
  }
  }

- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ void read_aligned_chunk(void * dest, size_t size) {
+ size_t offset = tell();
  off_t aligned_offset = offset & ~(alignment - 1);
  off_t offset_from_alignment = offset - aligned_offset;
  size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -283,13 +307,21 @@ struct llama_file::impl {
  std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

  seek(aligned_offset, SEEK_SET);
- read_raw(buffer.get(), bytes_to_read);
+ read_raw_unsafe(buffer.get(), bytes_to_read);

  uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
  memcpy(dest, reinterpret_cast<void *>(actual_data), size);
  }

- uint32_t read_u32() const {
+ void read_raw(void * ptr, size_t len) {
+ if (has_direct_io()) {
+ read_aligned_chunk(ptr, len);
+ } else {
+ read_raw_unsafe(ptr, len);
+ }
+ }
+
+ uint32_t read_u32() {
  uint32_t ret;
  read_raw(&ret, sizeof(ret));
  return ret;
@@ -310,6 +342,10 @@ struct llama_file::impl {
  write_raw(&val, sizeof(val));
  }

+ bool has_direct_io() const {
+ return fd != -1 && alignment > 1;
+ }
+
  ~impl() {
  if (fd != -1) {
  close(fd);
@@ -318,17 +354,9 @@ struct llama_file::impl {
  }
  }
  int fd = -1;
+ std::string fname;
  #endif

- void read_raw_at(void * ptr, size_t len, size_t offset) const {
- if (alignment != 1) {
- read_aligned_chunk(offset, ptr, len);
- } else {
- seek(offset, SEEK_SET);
- read_raw(ptr, len);
- }
- }
-
  size_t read_alignment() const {
  return alignment;
  }
@@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
  size_t llama_file::size() const { return pimpl->size; }

  size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+ bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }

  int llama_file::file_id() const {
  #ifdef _WIN32
@@ -361,10 +390,14 @@ int llama_file::file_id() const {
  }

  void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
- void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
- void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+ void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+ #ifdef _WIN32
+ void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
+ #else
+ void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
+ #endif

- uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+ uint32_t llama_file::read_u32() { return pimpl->read_u32(); }

  void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
  void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
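Note on the aligned-read path above: read_aligned_chunk() now derives the offset from tell(), rounds it down to the device block size, and rounds the request length up, so every O_DIRECT read starts and ends on an alignment boundary. A minimal, standalone sketch of that arithmetic; the 4096-byte block size and the offsets are hypothetical values, not taken from the package:

// Sketch of the offset/length rounding used by read_aligned_chunk(),
// assuming a hypothetical 4096-byte block size (st_blksize).
#include <cstdio>
#include <cstddef>

int main() {
    const size_t alignment = 4096;   // hypothetical st_blksize
    const size_t offset    = 10000;  // current file position, i.e. tell()
    const size_t size      = 500;    // bytes the caller actually wants

    const size_t aligned_offset        = offset & ~(alignment - 1);            // 8192
    const size_t offset_from_alignment = offset - aligned_offset;              // 1808
    const size_t bytes_to_read =
        (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);     // 4096

    // The O_DIRECT read covers [aligned_offset, aligned_offset + bytes_to_read),
    // and the caller's bytes are copied out starting at offset_from_alignment.
    std::printf("read %zu bytes at offset %zu, copy from +%zu\n",
                bytes_to_read, aligned_offset, offset_from_alignment);
    return 0;
}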
package/src/llama.cpp/src/llama-mmap.h
@@ -24,15 +24,16 @@ struct llama_file {

  void seek(size_t offset, int whence) const;

- void read_raw(void * ptr, size_t len) const;
- void read_raw_at(void * ptr, size_t len, size_t offset) const;
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
- uint32_t read_u32() const;
+ void read_raw(void * ptr, size_t len);
+ void read_raw_unsafe(void * ptr, size_t len);
+ void read_aligned_chunk(void * dest, size_t size);
+ uint32_t read_u32();

  void write_raw(const void * ptr, size_t len) const;
  void write_u32(uint32_t val) const;

  size_t read_alignment() const;
+ bool has_direct_io() const;
  private:
  struct impl;
  std::unique_ptr<impl> pimpl;
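With read_raw_at() removed, callers of llama_file seek explicitly and then call read_raw(), which routes through the aligned direct-I/O path when has_direct_io() is true; read_raw_unsafe() is the raw variant for callers that already manage alignment themselves. A hedged sketch of how code inside the llama.cpp tree might drive this interface; llama-mmap.h is an internal header and the offsets here are placeholders, not real tensor offsets:

// Sketch only: assumes the internal llama-mmap.h header from src/llama.cpp/src.
#include <cstdio>   // SEEK_SET
#include <cstdint>
#include <vector>
#include "llama-mmap.h"

static std::vector<uint8_t> read_blob(llama_file & f, size_t offs, size_t n_size) {
    std::vector<uint8_t> out(n_size);
    // read_raw_at() no longer exists: seek explicitly, then read.
    f.seek(offs, SEEK_SET);
    // read_raw() uses read_aligned_chunk() when has_direct_io() reports an
    // active O_DIRECT descriptor, and the buffered path otherwise.
    f.read_raw(out.data(), n_size);
    return out;
}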
package/src/llama.cpp/src/llama-model-loader.cpp
@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
  const std::string & fname,
  std::vector<std::string> & splits,
  bool use_mmap,
+ bool use_direct_io,
  bool check_tensors,
  bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

- files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
  contexts.emplace_back(ctx);

+ use_direct_io = use_direct_io && files.back()->has_direct_io();
+
+ // Disable mmap in case Direct I/O is enabled and available
+ if (use_direct_io && use_mmap) {
+ use_mmap = false;
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ }
+
  // Save tensors data offset of the main file.
  // For subsidiary files, `meta` tensor data offset must not be used,
  // so we build a unified tensors index for weights.
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
  }
  }

- files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
  contexts.emplace_back(ctx);

  // Save tensors data offset info of the shard.
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
  }

  this->use_mmap = use_mmap;
+ this->use_direct_io = use_direct_io;
  this->check_tensors = check_tensors;
  this->no_alloc = no_alloc;
  }
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
  const auto & file = files.at(weight->idx);

  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->read_raw_at(cur->data, n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(cur->data, n_size);
  if (check_tensors) {
  validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
  return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
  ggml_backend_event_synchronize(events[buffer_idx]);

  // Read aligned chunk from file
- file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+ file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

  // Calculate actual data portion (excluding alignment padding)
  uintptr_t ptr_data = ptr_dest_aligned;
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
  }
  } else {
  read_buf.resize(n_size);
- file->read_raw_at(read_buf.data(), n_size, weight->offs);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
  throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
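The loader constructor above arbitrates between mmap and direct I/O: the direct-I/O request only sticks if the opened file actually supports it, and an active direct-I/O path turns mmap off for that load. A self-contained sketch of that decision; the function and type names are illustrative, not part of the package:

// Sketch of the mmap / direct-I/O arbitration added to the loader constructor.
#include <cstdio>

struct load_flags { bool use_mmap; bool use_direct_io; };

static load_flags resolve_io_flags(bool want_mmap, bool want_direct_io, bool file_has_direct_io) {
    // direct I/O is kept only if the opened file supports it
    load_flags f{want_mmap, want_direct_io && file_has_direct_io};
    if (f.use_direct_io && f.use_mmap) {
        f.use_mmap = false;  // mirrors the "direct I/O is enabled, disabling mmap" warning
    }
    return f;
}

int main() {
    load_flags f = resolve_io_flags(/*want_mmap=*/true, /*want_direct_io=*/true, /*file_has_direct_io=*/true);
    std::printf("mmap=%d direct_io=%d\n", f.use_mmap, f.use_direct_io);  // mmap=0 direct_io=1
    return 0;
}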
package/src/llama.cpp/src/llama-model-loader.h
@@ -70,6 +70,7 @@ struct llama_model_loader {
  size_t n_bytes = 0;

  bool use_mmap = false;
+ bool use_direct_io = false;
  bool check_tensors;
  bool no_alloc;

@@ -97,6 +98,7 @@ struct llama_model_loader {
  const std::string & fname,
  std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
  bool use_mmap,
+ bool use_direct_io,
  bool check_tensors,
  bool no_alloc,
  const llama_model_kv_override * param_overrides_p,
package/src/llama.cpp/src/llama-model-saver.cpp
@@ -146,6 +146,9 @@ void llama_model_saver::add_kv_from_model() {
  add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
  add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ if (hparams.n_embd_out > 0) {
+ add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
+ }
  add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
package/src/llama.cpp/src/llama-model.cpp
@@ -507,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
  ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -578,6 +579,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

+ // TODO: Handle SWA metadata similarly when models start implementing it
  // rope_freq_scale (inverse of the kv) is optional
  float ropescale = 0.0f;
  if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -586,10 +588,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

- // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
- hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
- hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
  ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

  // non-transformer models do not have attention heads
@@ -677,6 +675,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.f_attn_temp_scale = 0.1f;
  hparams.f_attn_temp_offset = 1.0f;
  hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }

  switch (hparams.n_expert) {
@@ -722,6 +724,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1243,7 +1249,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (found_swa && hparams.n_swa > 0) {
  uint32_t swa_period = 8;
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.rope_freq_scale_train_swa = 1.0f;
  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
  hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1314,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_swa = 4096; // default value of gemma 2
  hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1342,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(6);

- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1365,10 +1372,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(5);

  hparams.n_layer_kv_from_start = 20;
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;
  hparams.f_attention_scale = 1.0f;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -1384,9 +1390,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.set_swa_pattern(6);

  hparams.causal_attn = false; // embeddings do not use causal attention
- hparams.rope_freq_base_train_swa = 10000.0f;
- hparams.rope_freq_scale_train_swa = 1.0f;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1530,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1572,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (found_swa && hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }
@@ -1906,6 +1918,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  }

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2224,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);

+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_20B; break;
  case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2272,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
  hparams.set_swa_pattern(4, true);
+
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -2416,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  const bool use_mmap_buffer = true;

- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");

  // build a list of buffer types for the CPU and GPU devices
  pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2427,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  }

+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
  // calculate the split points
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  std::vector<float> splits(n_devices());
@@ -2437,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  size_t total;
  size_t free;
  ggml_backend_dev_memory(dev, &free, &total);
+
+ // devices can return 0 bytes for free and total memory if they do not
+ // have any to report. in this case, we will use the host memory as a fallback
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+ if (free == 0 && total == 0) {
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
+ }
  splits[i] = free;
  }
  } else {
@@ -2453,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  splits[i] /= split_sum;
  }

- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (cpu_dev == nullptr) {
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
- }
  const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -6446,6 +6479,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
  }
  }
+
+ // for LFM2-ColBert-350M
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
@@ -7098,6 +7134,10 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+ }
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7910,12 +7950,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  // add on pooling layer
  llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+ // add backend sampling layers (if any)
+ llm->build_sampling();
+
  // if the gguf model was converted with --sentence-transformers-dense-modules
  // there will be two additional dense projection layers
  // dense linear projections are applied after pooling
  // TODO: move reranking logic here and generalize
  llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);

+ llm->res->set_outputs();
+
  return llm->res->get_gf();
  }

@@ -7937,6 +7982,7 @@ llama_model_params llama_model_default_params() {
  /*.kv_overrides =*/ nullptr,
  /*.vocab_only =*/ false,
  /*.use_mmap =*/ true,
+ /*.use_direct_io =*/ true,
  /*.use_mlock =*/ false,
  /*.check_tensors =*/ false,
  /*.use_extra_bufts =*/ true,
@@ -7971,6 +8017,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
  return model->hparams.n_embd_inp();
  }

+ int32_t llama_model_n_embd_out(const llama_model * model) {
+ return model->hparams.get_n_embd_out();
+ }
+
  int32_t llama_model_n_layer(const llama_model * model) {
  return model->hparams.n_layer;
  }
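Together with the new use_direct_io default in llama_model_default_params() and the llama_model_n_embd_out() accessor above, application code can opt in or out of direct I/O at load time and query the output embedding width. A hedged usage sketch against the public llama.h API; the model path is a placeholder and error handling is minimal:

#include <cstdio>
#include "llama.h"

int main() {
    llama_model_params mp = llama_model_default_params();
    // use_direct_io is a new field that now defaults to true; the loader
    // disables mmap when direct I/O is actually available for the file.
    mp.use_direct_io = true;
    llama_model * model = llama_model_load_from_file("model.gguf", mp);
    if (model == nullptr) {
        return 1;
    }
    std::printf("n_embd = %d, n_embd_out = %d\n",
                llama_model_n_embd(model), llama_model_n_embd_out(model));
    llama_model_free(model);
    return 0;
}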
package/src/llama.cpp/src/llama-quant.cpp
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  }

  std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
  ml.init_mappings(false); // no prefetching

  llama_model model(llama_model_default_params());