@fugood/llama.node 1.4.12 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. package/lib/binding.ts +11 -1
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +2 -0
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -9
  7. package/src/LlamaContext.cpp +5 -2
  8. package/src/llama.cpp/common/arg.cpp +249 -101
  9. package/src/llama.cpp/common/arg.h +0 -8
  10. package/src/llama.cpp/common/chat.cpp +4 -4
  11. package/src/llama.cpp/common/common.cpp +21 -1
  12. package/src/llama.cpp/common/common.h +20 -7
  13. package/src/llama.cpp/common/download.cpp +104 -55
  14. package/src/llama.cpp/common/download.h +26 -5
  15. package/src/llama.cpp/common/llguidance.cpp +10 -6
  16. package/src/llama.cpp/common/preset.cpp +76 -1
  17. package/src/llama.cpp/common/preset.h +10 -1
  18. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  19. package/src/llama.cpp/common/sampling.cpp +58 -14
  20. package/src/llama.cpp/common/sampling.h +3 -1
  21. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  22. package/src/llama.cpp/include/llama.h +92 -10
  23. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  24. package/src/llama.cpp/src/llama-arch.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +615 -28
  26. package/src/llama.cpp/src/llama-context.h +43 -1
  27. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  28. package/src/llama.cpp/src/llama-grammar.h +2 -0
  29. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  30. package/src/llama.cpp/src/llama-graph.h +71 -6
  31. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  32. package/src/llama.cpp/src/llama-hparams.h +8 -2
  33. package/src/llama.cpp/src/llama-mmap.cpp +70 -37
  34. package/src/llama.cpp/src/llama-mmap.h +5 -4
  35. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  36. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  37. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  38. package/src/llama.cpp/src/llama-model.cpp +66 -16
  39. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  40. package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
  41. package/src/llama.cpp/src/llama-sampling.h +16 -7
  42. package/src/llama.cpp/src/llama.cpp +101 -57
  43. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  44. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  45. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  46. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  47. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  48. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  49. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
@@ -14,7 +14,16 @@ struct llama_grammar;
14
14
  struct llama_sampler_chain {
15
15
  llama_sampler_chain_params params;
16
16
 
17
- std::vector<struct llama_sampler *> samplers;
17
+ // has .backend_init() been called?
18
+ bool is_init = false;
19
+
20
+ struct info {
21
+ bool is_backend;
22
+
23
+ llama_sampler * ptr;
24
+ };
25
+
26
+ std::vector<info> samplers;
18
27
 
19
28
  // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
20
29
  std::vector<llama_token_data> cur;
@@ -27,9 +36,9 @@ struct llama_sampler_chain {
27
36
  };
28
37
 
29
38
  struct llama_sampler * llama_sampler_init_dry_testing(
30
- int32_t context_size,
31
- float dry_multiplier,
32
- float dry_base,
33
- int32_t dry_allowed_length,
34
- int32_t dry_penalty_last_n,
35
- const std::vector<std::vector<llama_token>>& seq_breakers);
39
+ int32_t context_size,
40
+ float dry_multiplier,
41
+ float dry_base,
42
+ int32_t dry_allowed_length,
43
+ int32_t dry_penalty_last_n,
44
+ const std::vector<std::vector<llama_token>> & seq_breakers);
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
111
111
  }
112
112
  }
113
113
  for (size_t i = 0; i < ret.size(); i++) {
114
- size_t free, total;
114
+ size_t free;
115
+ size_t total;
115
116
  ggml_backend_dev_memory(model->devices[i], &free, &total);
117
+
118
+ // devices can return 0 bytes for free and total memory if they do not
119
+ // have any to report. in this case, we will use the host memory as a fallback
120
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
121
+ if (free == 0 && total == 0) {
122
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
123
+ if (cpu_dev == nullptr) {
124
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
125
+ }
126
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
127
+ }
116
128
  ret[i].free = free;
117
129
  ret[i].total = total;
118
130
  }
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
147
159
  static void llama_params_fit_impl(
148
160
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
149
161
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
150
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
162
+ size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
151
163
  constexpr int64_t MiB = 1024*1024;
152
- const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
153
164
  typedef std::vector<llama_device_memory_data> dmds_t;
154
165
  const llama_model_params default_mparams = llama_model_default_params();
155
166
 
@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
168
179
  return;
169
180
  }
170
181
 
182
+ std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
183
+ margins.reserve(nd);
184
+ for (size_t id = 0; id < nd; id++) {
185
+ margins.push_back(margins_s[id]);
186
+ }
187
+
171
188
  std::vector<std::string> dev_names;
172
189
  {
173
190
  dev_names.reserve(nd);
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
187
204
 
188
205
  int64_t sum_free = 0;
189
206
  int64_t sum_projected_free = 0;
190
- int64_t min_projected_free = INT64_MAX;
191
207
  int64_t sum_projected_used = 0;
192
208
  int64_t sum_projected_model = 0;
209
+ std::vector<int64_t> projected_free_per_device;
210
+ projected_free_per_device.reserve(nd);
193
211
 
194
212
  if (nd > 1) {
195
213
  LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
199
217
 
200
218
  const int64_t projected_used = dmd.mb.total();
201
219
  const int64_t projected_free = dmd.free - projected_used;
220
+ projected_free_per_device.push_back(projected_free);
202
221
 
203
222
  sum_free += dmd.free;
204
223
  sum_projected_used += projected_used;
205
224
  sum_projected_free += projected_free;
206
- min_projected_free = std::min(min_projected_free, projected_free);
207
225
  sum_projected_model += dmd.mb.model;
208
226
 
209
227
  if (nd > 1) {
210
- LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
211
- __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
212
- projected_free >= 0 ? "surplus" : "deficit");
228
+ LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
229
+ __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
213
230
  }
214
231
  }
215
232
  assert(sum_free >= 0 && sum_projected_used >= 0);
216
233
  LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
217
234
  __func__, sum_projected_used/MiB, sum_free/MiB);
218
- if (min_projected_free >= margin) {
219
- if (nd == 1) {
235
+ if (nd == 1) {
236
+ if (projected_free_per_device[0] >= margins[0]) {
220
237
  LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
221
- __func__, min_projected_free/MiB, margin/MiB);
238
+ __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
239
+ return;
240
+ }
241
+ } else {
242
+ bool changes_needed = false;
243
+ for (size_t id = 0; id < nd; id++) {
244
+ if (projected_free_per_device[id] < margins[id]) {
245
+ changes_needed = true;
246
+ break;
247
+ }
248
+ }
249
+ if (!changes_needed) {
250
+ LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
222
251
  return;
223
252
  }
224
- LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
225
- __func__, min_projected_free/MiB, margin/MiB);
226
- return;
227
253
  }
228
254
 
229
255
  // step 2: try reducing memory use by reducing the context size
230
256
 
231
257
  {
232
- int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
258
+ int64_t global_surplus = sum_projected_free;
259
+ for (size_t id = 0; id < nd; id++) {
260
+ global_surplus -= margins[id];
261
+ }
233
262
  if (global_surplus < 0) {
234
- LLAMA_LOG_INFO(nd == 1 ?
235
- "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
236
- "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
237
- __func__, margin/MiB, -global_surplus/MiB);
263
+ if (nd == 1) {
264
+ LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
265
+ __func__, margins[0]/MiB, -global_surplus/MiB);
266
+ } else {
267
+ LLAMA_LOG_INFO(
268
+ "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
269
+ __func__, -global_surplus/MiB);
270
+ }
238
271
  if (cparams->n_ctx == 0) {
239
272
  if (hp_nct > n_ctx_min) {
240
- int64_t sum_used_target = sum_free - nd*margin_s;
273
+ int64_t sum_used_target = sum_free;
274
+ for (size_t id = 0; id < nd; id++) {
275
+ sum_used_target -= margins[id];
276
+ }
241
277
  if (nd > 1) {
242
278
  // for multiple devices we need to be more conservative in terms of how much context we think can fit:
243
279
  // - for dense models only whole layers can be assigned to devices
@@ -359,6 +395,11 @@ static void llama_params_fit_impl(
359
395
 
360
396
  // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
361
397
  layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
398
+
399
+ uint32_t n_full() const {
400
+ assert(n_layer >= n_part);
401
+ return n_layer - n_part;
402
+ }
362
403
  };
363
404
 
364
405
  const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -382,7 +423,7 @@ static void llama_params_fit_impl(
382
423
 
383
424
  size_t itbo = 0;
384
425
  for (size_t id = 0; id < nd; id++) {
385
- il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
426
+ il0 += ngl_per_device[id].n_full();
386
427
  for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
387
428
  if (itbo + 1 >= ntbo) {
388
429
  tensor_buft_overrides[itbo].pattern = nullptr;
@@ -393,7 +434,7 @@ static void llama_params_fit_impl(
393
434
  + std::to_string(ntbo) + " is insufficient for model");
394
435
  }
395
436
  tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
396
- tensor_buft_overrides[itbo].buft = overflow_bufts[id];
437
+ tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
397
438
  itbo++;
398
439
  }
399
440
  il0 += ngl_per_device[id].n_part;
@@ -443,9 +484,9 @@ static void llama_params_fit_impl(
443
484
  const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
444
485
  path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
445
486
 
446
- for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
447
- global_surplus_cpu_moe += dmd.free;
448
- global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
487
+ for (size_t id = 0; id < nd; id++) {
488
+ global_surplus_cpu_moe += dmds_cpu_moe[id].free;
489
+ global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
449
490
  }
450
491
 
451
492
  if (global_surplus_cpu_moe > 0) {
@@ -464,24 +505,18 @@ static void llama_params_fit_impl(
464
505
  std::vector<int64_t> targets; // maximum acceptable memory use per device
465
506
  targets.reserve(nd);
466
507
  for (size_t id = 0; id < nd; id++) {
467
- targets.push_back(dmds_full[id].free - margin);
508
+ targets.push_back(dmds_full[id].free - margins[id]);
468
509
  LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
469
510
  }
470
511
 
471
- std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
512
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
472
513
  overflow_bufts.reserve(nd);
473
- for (size_t id = 0; id < nd - 1; ++id) {
474
- overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
514
+ for (size_t id = 0; id < nd; id++) {
515
+ overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
475
516
  }
476
- overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
477
517
 
478
518
  std::vector<ngl_t> ngl_per_device(nd);
479
519
  std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
480
- if (hp_nex > 0) {
481
- for (size_t id = 0; id < nd; id++) {
482
- ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
483
- }
484
- }
485
520
 
486
521
  // optimize the number of layers per device using the method of false position:
487
522
  // - ngl_per_device has 0 layers for each device, lower bound
@@ -512,9 +547,6 @@ static void llama_params_fit_impl(
512
547
  if (mem_high[id] > targets[id]) {
513
548
  assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
514
549
  uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
515
- if (hp_nex > 0 && size_t(id) == nd - 1) {
516
- delta--;
517
- }
518
550
  LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
519
551
  while (delta > 1) {
520
552
  uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
@@ -524,7 +556,8 @@ static void llama_params_fit_impl(
524
556
  std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
525
557
  ngl_per_device_test[id].n_layer += step_size;
526
558
  if (hp_nex) {
527
- ngl_per_device_test[id].n_part += step_size;
559
+ ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
560
+ step_size - 1 : step_size; // the first layer is the output layer which must always be full
528
561
  }
529
562
  const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
530
563
 
@@ -573,7 +606,7 @@ static void llama_params_fit_impl(
573
606
  assert(id_dense_start < nd);
574
607
 
575
608
  LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
576
- for (size_t id = 0; id <= id_dense_start; id++) {
609
+ for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
577
610
  std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
578
611
  for (size_t jd = id_dense_start; jd < nd; jd++) {
579
612
  const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -585,12 +618,8 @@ static void llama_params_fit_impl(
585
618
  std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
586
619
 
587
620
  if (mem_high[id] > targets[id]) {
588
- assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
589
- assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
590
- assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
591
- >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
592
- uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
593
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
621
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
622
+ uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
594
623
  while (delta > 1) {
595
624
  uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
596
625
  step_size = std::max(step_size, uint32_t(1));
@@ -606,7 +635,7 @@ static void llama_params_fit_impl(
606
635
  ngl_per_device_test[id].n_layer += n_convert_jd;
607
636
  n_converted_test += n_convert_jd;
608
637
 
609
- if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
638
+ if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
610
639
  break;
611
640
  }
612
641
  }
@@ -625,8 +654,8 @@ static void llama_params_fit_impl(
625
654
  LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
626
655
  __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
627
656
  }
628
- delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
629
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
657
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
658
+ delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
630
659
  }
631
660
  } else {
632
661
  ngl_per_device = ngl_per_device_high;
@@ -644,14 +673,19 @@ static void llama_params_fit_impl(
644
673
  ngl_per_device_test[id_dense_start_test].n_part--;
645
674
  ngl_per_device_test[id].n_layer++;
646
675
  ngl_per_device_test[id].n_part++;
647
- if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
676
+ if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
648
677
  id_dense_start_test++;
649
678
  }
650
679
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
680
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
681
+ if (id < nd - 1) {
682
+ overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
683
+ }
651
684
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
652
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
685
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
653
686
  if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
654
687
  ngl_per_device = ngl_per_device_test;
688
+ overflow_bufts = overflow_bufts_test;
655
689
  mem = mem_test;
656
690
  id_dense_start = id_dense_start_test;
657
691
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -659,9 +693,10 @@ static void llama_params_fit_impl(
659
693
 
660
694
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
661
695
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
662
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
696
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
663
697
  if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
664
698
  ngl_per_device = ngl_per_device_test;
699
+ overflow_bufts = overflow_bufts_test;
665
700
  mem = mem_test;
666
701
  id_dense_start = id_dense_start_test;
667
702
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -670,9 +705,10 @@ static void llama_params_fit_impl(
670
705
  } else {
671
706
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
672
707
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
673
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
708
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
674
709
  if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
675
710
  ngl_per_device = ngl_per_device_test;
711
+ overflow_bufts = overflow_bufts_test;
676
712
  mem = mem_test;
677
713
  id_dense_start = id_dense_start_test;
678
714
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -687,17 +723,25 @@ static void llama_params_fit_impl(
687
723
  __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
688
724
  }
689
725
 
726
+ // print info for devices that were not changed during the conversion from dense only to full layers:
727
+ for (size_t id = id_dense_start + 1; id < nd; id++) {
728
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
729
+ LLAMA_LOG_INFO(
730
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
731
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
732
+ }
733
+
690
734
  set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
691
735
  }
692
736
 
693
737
  enum llama_params_fit_status llama_params_fit(
694
738
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
695
739
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
696
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
740
+ size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
697
741
  const int64_t t0_us = llama_time_us();
698
742
  llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
699
743
  try {
700
- llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
744
+ llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
701
745
  LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
702
746
  } catch (const llama_params_fit_exception & e) {
703
747
  LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
@@ -713,7 +757,7 @@ enum llama_params_fit_status llama_params_fit(
713
757
 
714
758
  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
715
759
  struct llama_sampler_chain_params result = {
716
- /*.no_perf =*/ true,
760
+ /*.no_perf =*/ true,
717
761
  };
718
762
 
719
763
  return result;
@@ -786,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
786
830
  model.t_start_us = tm.t_start_us;
787
831
 
788
832
  try {
789
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
833
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
790
834
 
791
835
  ml.print_info();
792
836
 
@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
22
22
  const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
23
23
 
24
24
  for (int il = 0; il < n_layer; ++il) {
25
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
26
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
27
+
25
28
  ggml_tensor * inpSA = inpL;
26
29
 
30
+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
31
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
32
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
33
+
27
34
  // dual attention normalization (pre)
28
35
  cur = build_norm(inpL,
29
36
  model.layers[il].attn_norm, NULL,
@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
56
63
  cb(Qcur, "Qcur_normed", il);
57
64
  cb(Kcur, "Kcur_normed", il);
58
65
 
59
- // RoPE only for sliding_attention layers
60
- const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
61
- ((il + 1) % hparams.n_no_rope_layer_step) != 0;
62
66
  if (use_rope) {
63
67
  Qcur = ggml_rope_ext(
64
68
  ctx0, Qcur, inp_pos, nullptr,
65
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
69
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
66
70
  ext_factor, attn_factor, beta_fast, beta_slow);
67
71
  cb(Qcur, "Qcur_rope", il);
68
72
 
69
73
  Kcur = ggml_rope_ext(
70
74
  ctx0, Kcur, inp_pos, nullptr,
71
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
75
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
72
76
  ext_factor, attn_factor, beta_fast, beta_slow);
73
77
  cb(Kcur, "Kcur_rope", il);
74
78
  }
@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
21
21
 
22
22
  for (int il = 0; il < n_layer; ++il) {
23
23
  const bool is_swa = hparams.is_swa(il);
24
+ // UNUSED:
25
+ // const float freq_base_l = model.get_rope_freq_base (cparams, il);
26
+ // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
24
27
 
25
28
  // norm
26
29
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
19
19
  ggml_tensor * inp_out_ids = build_inp_out_ids();
20
20
 
21
21
  for (int il = 0; il < n_layer; ++il) {
22
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
23
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
24
+
22
25
  // norm
23
26
  cur = build_norm(inpL,
24
27
  model.layers[il].attn_norm, NULL,
@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
43
46
 
44
47
  Qcur = ggml_rope_ext(
45
48
  ctx0, Qcur, inp_pos, nullptr,
46
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
49
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
47
50
  ext_factor, attn_factor, beta_fast, beta_slow);
48
51
 
49
52
  Kcur = ggml_rope_ext(
50
53
  ctx0, Kcur, inp_pos, nullptr,
51
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
54
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
52
55
  ext_factor, attn_factor, beta_fast, beta_slow);
53
56
 
54
57
  cb(Qcur, "Qcur", il);
@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
25
25
  ggml_tensor * inp_out_ids = build_inp_out_ids();
26
26
 
27
27
  for (int il = 0; il < n_layer; ++il) {
28
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
29
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
30
+
28
31
  ggml_tensor * inpSA = inpL;
29
32
 
33
+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
30
34
  const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
31
35
  (il + 1) % hparams.n_no_rope_layer_step != 0;
32
36
 
@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
67
71
  if (use_rope) {
68
72
  Qcur = ggml_rope_ext(
69
73
  ctx0, Qcur, inp_pos, rope_factors,
70
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
74
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
71
75
  ext_factor, attn_factor, beta_fast, beta_slow
72
76
  );
73
77
 
74
78
  Kcur = ggml_rope_ext(
75
79
  ctx0, Kcur, inp_pos, rope_factors,
76
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
80
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
77
81
  ext_factor, attn_factor, beta_fast, beta_slow
78
82
  );
79
83
  } else if (inp_attn_scale) {
@@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
23
23
  auto * inp_attn = build_attn_inp_no_cache();
24
24
 
25
25
  for (int il = 0; il < n_layer; ++il) {
26
- float freq_base_l = model.get_rope_freq_base(cparams, il);
26
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
27
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
27
28
 
28
29
  cur = inpL;
29
30
 
@@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
48
49
  // RoPE
49
50
  Qcur = ggml_rope_ext(
50
51
  ctx0, Qcur, inp_pos, nullptr,
51
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
52
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
52
53
  ext_factor, attn_factor, beta_fast, beta_slow
53
54
  );
54
55
 
55
56
  Kcur = ggml_rope_ext(
56
57
  ctx0, Kcur, inp_pos, nullptr,
57
- n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
58
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
58
59
  ext_factor, attn_factor, beta_fast, beta_slow
59
60
  );
60
61
 
@@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
14
14
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15
15
 
16
16
  for (int il = 0; il < n_layer; ++il) {
17
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
18
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
19
+
17
20
  ggml_tensor * inpSA = inpL;
18
21
 
19
22
  // norm
@@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
49
52
 
50
53
  Qcur = ggml_rope_ext(
51
54
  ctx0, Qcur, inp_pos, nullptr,
52
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
55
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
53
56
  ext_factor, attn_factor, beta_fast, beta_slow
54
57
  );
55
58
 
56
59
  Kcur = ggml_rope_ext(
57
60
  ctx0, Kcur, inp_pos, nullptr,
58
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
61
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
59
62
  ext_factor, attn_factor, beta_fast, beta_slow
60
63
  );
61
64
 
@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
26
26
  ggml_tensor * inp_out_ids = build_inp_out_ids();
27
27
 
28
28
  for (int il = 0; il < n_layer; ++il) {
29
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
30
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
31
+
29
32
  ggml_tensor * inpSA = inpL;
30
- ggml_tensor * probs = nullptr;
31
33
 
32
- probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
34
+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
35
+ const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
36
+ il % hparams.n_no_rope_layer_step != 0;
37
+
38
+ ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
33
39
  cb(probs, "ffn_moe_logits", il);
34
40
 
35
41
  // norm
@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
52
58
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
53
59
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
54
60
 
55
- if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
56
- Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
61
+ if (use_rope) {
62
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
57
63
  ext_factor, attn_factor, beta_fast, beta_slow);
58
64
 
59
- Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
65
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
60
66
  ext_factor, attn_factor, beta_fast, beta_slow);
61
67
  }
62
68
  cb(Qcur, "Qcur", il);