@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -140,6 +140,10 @@ enum layer_fraction_t {
  };
  // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

+ class llama_params_fit_exception : public std::runtime_error {
+ using std::runtime_error::runtime_error;
+ };
+
  static void llama_params_fit_impl(
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -181,12 +185,11 @@ static void llama_params_fit_impl(
  }
  }

- int64_t sum_total = 0;
+ int64_t sum_free = 0;
  int64_t sum_projected_free = 0;
  int64_t min_projected_free = INT64_MAX;
  int64_t sum_projected_used = 0;
  int64_t sum_projected_model = 0;
- int64_t sum_projected_ctx = 0;

  if (nd > 1) {
  LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -197,12 +200,11 @@
  const int64_t projected_used = dmd.mb.total();
  const int64_t projected_free = dmd.free - projected_used;

- sum_total += dmd.total;
+ sum_free += dmd.free;
  sum_projected_used += projected_used;
  sum_projected_free += projected_free;
  min_projected_free = std::min(min_projected_free, projected_free);
  sum_projected_model += dmd.mb.model;
- sum_projected_ctx += dmd.mb.context;

  if (nd > 1) {
  LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
@@ -210,10 +212,9 @@
  projected_free >= 0 ? "surplus" : "deficit");
  }
  }
- assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
- assert(sum_projected_used >= sum_projected_ctx);
+ assert(sum_free >= 0 && sum_projected_used >= 0);
  LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
- __func__, sum_projected_used/MiB, sum_total/MiB);
+ __func__, sum_projected_used/MiB, sum_free/MiB);
  if (min_projected_free >= margin) {
  if (nd == 1) {
  LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
@@ -236,9 +237,7 @@
  __func__, margin/MiB, -global_surplus/MiB);
  if (cparams->n_ctx == 0) {
  if (hp_nct > n_ctx_min) {
- const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
-
- int64_t memory_reduction = -global_surplus;
+ int64_t sum_used_target = sum_free - nd*margin_s;
  if (nd > 1) {
  // for multiple devices we need to be more conservative in terms of how much context we think can fit:
  // - for dense models only whole layers can be assigned to devices
@@ -246,24 +245,34 @@
  // - on average we expect a waste of 0.5 layers/tensors per device
  // - use slightly more than the expected average for nd devices to be safe
  const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
- memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+ sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
  }

- uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
- cparams->n_ctx = hp_nct - ctx_reduction;
- cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
- ctx_reduction = hp_nct - cparams->n_ctx;
- memory_reduction = ctx_reduction * bytes_per_ctx;
- global_surplus += memory_reduction;
- LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
- __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
- if (global_surplus >= 0) {
+ int64_t sum_projected_used_min_ctx = 0;
+ cparams->n_ctx = n_ctx_min;
+ const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ for (const auto & dmd : dmds_min_ctx) {
+ sum_projected_used_min_ctx += dmd.mb.total();
+ }
+ if (sum_used_target > sum_projected_used_min_ctx) {
+ // linear interpolation between minimum and maximum context size:
+ cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+ / (sum_projected_used - sum_projected_used_min_ctx);
+ cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+ const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+ const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
  if (nd == 1) {
  LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
  return;
  }
  LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ } else {
+ const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
  }
  } else {
  LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
@@ -276,28 +285,28 @@ static void llama_params_fit_impl(
  }

  if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
- throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+ throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
  }
  if (nd > 1) {
  if (!tensor_split) {
- throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+ throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
  }
  if (mparams->tensor_split) {
  for (size_t id = 0; id < nd; id++) {
  if (mparams->tensor_split[id] != 0.0f) {
- throw std::runtime_error("model_params::tensor_split already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
  }
  }
  }
  if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
- throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+ throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
  }
  }
  if (!tensor_buft_overrides) {
- throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+ throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
  }
  if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
- throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
  }

  // step 3: iteratively fill the back to front with "dense" layers
@@ -350,6 +359,11 @@ static void llama_params_fit_impl(

  // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
  layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+ uint32_t n_full() const {
+ assert(n_layer >= n_part);
+ return n_layer - n_part;
+ }
  };

  const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -373,18 +387,18 @@

  size_t itbo = 0;
  for (size_t id = 0; id < nd; id++) {
- il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+ il0 += ngl_per_device[id].n_full();
  for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
  if (itbo + 1 >= ntbo) {
  tensor_buft_overrides[itbo].pattern = nullptr;
  tensor_buft_overrides[itbo].buft = nullptr;
  itbo++;
  mparams.tensor_buft_overrides = tensor_buft_overrides;
- throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
- + std::to_string(ntbo) + " is insufficient for model\n");
+ throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ + std::to_string(ntbo) + " is insufficient for model");
  }
  tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
- tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+ tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
  itbo++;
  }
  il0 += ngl_per_device[id].n_part;
@@ -459,20 +473,14 @@
  LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
  }

- std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
  overflow_bufts.reserve(nd);
- for (size_t id = 0; id < nd - 1; ++id) {
- overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
+ for (size_t id = 0; id < nd; id++) {
+ overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
  }
- overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

  std::vector<ngl_t> ngl_per_device(nd);
  std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
- if (hp_nex > 0) {
- for (size_t id = 0; id < nd; id++) {
- ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
- }
- }

  // optimize the number of layers per device using the method of false position:
  // - ngl_per_device has 0 layers for each device, lower bound
@@ -512,7 +520,8 @@
  std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
  ngl_per_device_test[id].n_layer += step_size;
  if (hp_nex) {
- ngl_per_device_test[id].n_part += step_size;
+ ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+ step_size - 1 : step_size; // the first layer is the output layer which must always be full
  }
  const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

@@ -561,7 +570,7 @@
  assert(id_dense_start < nd);

  LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
- for (size_t id = 0; id <= id_dense_start; id++) {
+ for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
  std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
  for (size_t jd = id_dense_start; jd < nd; jd++) {
  const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -573,12 +582,8 @@
  std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

  if (mem_high[id] > targets[id]) {
- assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
- assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
- assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
- uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
  while (delta > 1) {
  uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
  step_size = std::max(step_size, uint32_t(1));
@@ -594,7 +599,7 @@
  ngl_per_device_test[id].n_layer += n_convert_jd;
  n_converted_test += n_convert_jd;

- if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+ if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
  break;
  }
  }
@@ -613,8 +618,8 @@
  LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
  __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
  }
- delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+ delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
  }
  } else {
  ngl_per_device = ngl_per_device_high;
@@ -632,14 +637,19 @@
  ngl_per_device_test[id_dense_start_test].n_part--;
  ngl_per_device_test[id].n_layer++;
  ngl_per_device_test[id].n_part++;
- if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+ if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
  id_dense_start_test++;
  }
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+ if (id < nd - 1) {
+ overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+ }
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -647,9 +657,10 @@

  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -658,9 +669,10 @@
  } else {
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
- if (mem_test[id] < targets[id]) {
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
  ngl_per_device = ngl_per_device_test;
+ overflow_bufts = overflow_bufts_test;
  mem = mem_test;
  id_dense_start = id_dense_start_test;
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -675,30 +687,41 @@
  __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
  }

+ // print info for devices that were not changed during the conversion from dense only to full layers:
+ for (size_t id = id_dense_start + 1; id < nd; id++) {
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+ }
+
  set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
  }

- bool llama_params_fit(
+ enum llama_params_fit_status llama_params_fit(
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
  size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
  const int64_t t0_us = llama_time_us();
- bool ok = true;
+ llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
  try {
  llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
  LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
- } catch (const std::runtime_error & e) {
+ } catch (const llama_params_fit_exception & e) {
  LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
- ok = false;
+ status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+ } catch (const std::runtime_error & e) {
+ LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+ status = LLAMA_PARAMS_FIT_STATUS_ERROR;
  }
  const int64_t t1_us = llama_time_us();
  LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
- return ok;
+ return status;
  }

  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
  struct llama_sampler_chain_params result = {
- /*.no_perf =*/ true,
+ /*.no_perf =*/ true,
  };

  return result;
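The hunk above also changes the return type of llama_params_fit() from bool to a three-way status enum and adds a separate catch for unexpected std::runtime_error. A minimal caller sketch, assuming the function and the LLAMA_PARAMS_FIT_STATUS_* values are declared in llama.h as used in this diff; the helper name, margin, and context values below are illustrative:

#include "llama.h"

// Hypothetical helper: fit params in place before loading; returns false only on a hard error.
static bool try_fit_params(const char * model_path,
                           llama_model_params * mparams, llama_context_params * cparams,
                           float * tensor_split, llama_model_tensor_buft_override * tbo) {
    switch (llama_params_fit(model_path, mparams, cparams, tensor_split, tbo,
                             /*margin_s  =*/ 1024u * 1024u * 1024u, // illustrative ~1 GiB free-memory margin
                             /*n_ctx_min =*/ 4096,
                             GGML_LOG_LEVEL_INFO)) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
            return true;  // params were adjusted (or already fit) in place
        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
            return true;  // fitting declined (e.g. n_gpu_layers already set by user); keep params as-is
        case LLAMA_PARAMS_FIT_STATUS_ERROR:
        default:
            return false; // unexpected runtime error while probing device memory
    }
}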
@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
  const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

  for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
  ggml_tensor * inpSA = inpL;

+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
+
  // dual attention normalization (pre)
  cur = build_norm(inpL,
  model.layers[il].attn_norm, NULL,
@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
  cb(Qcur, "Qcur_normed", il);
  cb(Kcur, "Kcur_normed", il);

- // RoPE only for sliding_attention layers
- const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
- ((il + 1) % hparams.n_no_rope_layer_step) != 0;
  if (use_rope) {
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Qcur, "Qcur_rope", il);

  Kcur = ggml_rope_ext(
  ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow);
  cb(Kcur, "Kcur_rope", il);
  }
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
  LLM_FFN_GELU, LLM_FFN_SEQ, il);
  cb(cur, "ffn_out", il);
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+ const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
+ auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
  cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  model.layers[il].ffn_gate, NULL, NULL,
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
- model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+ type_op, LLM_FFN_PAR, il);
  cb(cur, "ffn_out", il);
  } else {
  cur = build_ffn(cur,
@@ -3,12 +3,14 @@
  llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
  llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
- float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  GGML_ASSERT(n_embd_head == hparams.n_rot);

- ggml_tensor *inpL, *cur;
+ ggml_tensor * inpL;
+ ggml_tensor * cur;
+
  inpL = build_inp_embd(model.tok_embd);

  ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
  }

  ggml_tensor * inpSA = inpL;
- cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

  // build self attention
  {
@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

  for (int il = 0; il < n_layer; ++il) {
  const bool is_swa = hparams.is_swa(il);
+ // UNUSED:
+ // const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

  // norm
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
  model.layers[il].ffn_exp_probs_b,
  n_expert, n_expert_used,
  LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
  (llama_expert_gating_func_type) hparams.expert_gating_func,
  il);
  cb(moe_out, "ffn_moe_out", il);
@@ -1,7 +1,5 @@
  #include "models.h"

-
-
  llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
  llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
  inpL = build_inp_embd(model.tok_embd);

  // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);

  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
  ggml_tensor * inp_out_ids = build_inp_out_ids();

  for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
  // norm
  cur = build_norm(inpL,
  model.layers[il].attn_norm, NULL,
@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow);

  Kcur = ggml_rope_ext(
  ctx0, Kcur, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow);

  cb(Qcur, "Qcur", il);
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
  inpL = build_inp_embd(model.tok_embd);

  // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

@@ -1,7 +1,5 @@
  #include "models.h"

-
-
  llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
  llm_graph_context(params),
  model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
  inpL = build_inp_embd(model.tok_embd);

  // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
  // equivalent to get_per_layer_inputs() in python code
  // output shape: [n_embd_altup, n_layer, n_tokens]
  ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
- auto inp = std::make_unique<llm_graph_input_embd>();
+ auto inp = std::make_unique<llm_graph_input_embd>();
  ggml_tensor * inp_per_layer;
  if (ubatch.token) {
  inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
  ggml_tensor * inp_out_ids = build_inp_out_ids();

  for (int il = 0; il < n_layer; ++il) {
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
  ggml_tensor * inpSA = inpL;

+ // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
  const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
  (il + 1) % hparams.n_no_rope_layer_step != 0;

@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
  if (use_rope) {
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow
  );

  Kcur = ggml_rope_ext(
  ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
  } else if (inp_attn_scale) {
@@ -1,6 +1,7 @@
  #include "models.h"

- llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ template <bool embed>
+ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;

  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv();
+ using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+ inp_attn_type * inp_attn = nullptr;
+ if constexpr (embed) {
+ inp_attn = build_attn_inp_no_cache();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
  cb(cur, "result_norm", -1);
  res->t_embd = cur;

- // lm_head
- cur = build_lora_mm(model.output, cur);
+ if constexpr (!embed) {
+ // lm_head
+ cur = build_lora_mm(model.output, cur);

- cb(cur, "result_output", -1);
- res->t_logits = cur;
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+ }

  ggml_build_forward_expand(gf, cur);
  }
+
+ template struct llm_build_llama<false>;
+ template struct llm_build_llama<true>;
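The models/llama.cpp hunks above turn llm_build_llama into a class template over a bool embed parameter, pick the attention input type with std::conditional_t and if constexpr, and explicitly instantiate both variants at the end of the translation unit. A small self-contained illustration of that pattern; the struct and type names here are placeholders, not llama.cpp types:

// Minimal sketch of the compile-time selection + explicit instantiation pattern.
#include <type_traits>

struct attn_kv       { /* stand-in for the KV-cache attention input   */ };
struct attn_no_cache { /* stand-in for the cache-less attention input */ };

template <bool embed>
struct builder {
    // select the input type at compile time, as the diff does for inp_attn_type
    using inp_attn_type = std::conditional_t<embed, attn_no_cache, attn_kv>;

    inp_attn_type make_input() const {
        if constexpr (embed) {
            return attn_no_cache{}; // embedding graphs skip the KV cache
        } else {
            return attn_kv{};       // text-generation graphs use the KV cache
        }
    }
};

// explicit instantiation, mirroring `template struct llm_build_llama<false>;` etc.
template struct builder<false>;
template struct builder<true>;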