@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-arch.h

@@ -3,6 +3,7 @@
  #include "ggml.h" // ggml_op

  #include <string>
+ #include <set>

  //
  // gguf constants (sync with gguf.py)
@@ -79,6 +80,7 @@ enum llm_arch {
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_NEMOTRON_H,
+ LLM_ARCH_NEMOTRON_H_MOE,
  LLM_ARCH_EXAONE,
  LLM_ARCH_EXAONE4,
  LLM_ARCH_RWKV6,
@@ -315,6 +317,7 @@ enum llm_tensor {
  LLM_TENSOR_DENSE_3_OUT,
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
  LLM_TENSOR_ROPE_FREQS,
  LLM_TENSOR_ROPE_FACTORS_LONG,
  LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
  const int bid;
  const int xid;

+ const std::set<llm_tensor> model_tensors;
+
+ LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
  std::string str() const;

  operator std::string() const {
@@ -546,11 +553,11 @@ struct LLM_TN {
  llm_arch arch;

  LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
- return { arch, tensor, suffix, bid, xid };
+ return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
  }

  LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
- return { arch, tensor, nullptr, bid, xid };
+ return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
  }
  };

package/src/llama.cpp/src/llama-batch.cpp

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
  udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
  udata->output .resize(n_tokens);

+ udata->seq_id_data.reserve(n_tokens);
+
  seq_set_t seq_set_unq;

  for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
  }

  udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
- udata->seq_id[i] = batch.seq_id[idxs[i]];
  udata->output[i] = batch.logits[idxs[i]];

  for (int s = 0; s < udata->n_seq_id[i]; ++s) {
- seq_set_unq.set(udata->seq_id[i][s]);
+ const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+ udata->seq_id_data.push_back(seq_id);
+ seq_set_unq.set(seq_id);
  }

  if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
  }
  }

+ llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+ for (size_t i = 0; i < idxs.size(); ++i) {
+ udata->seq_id[i] = seq_id_ptr;
+ seq_id_ptr += udata->n_seq_id[i];
+ }
+
  for (uint32_t s = 0; s < n_seq_max; ++s) {
  if (seq_set_unq.test(s)) {
  udata->seq_idx[s] = udata->seq_id_unq.size();
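
Note: the llama-batch.cpp hunk above (together with the llama-batch.h hunk below) stops storing borrowed pointers into the caller's batch and instead copies all sequence ids into one owned, contiguous vector, fixing up the per-token pointers only after that vector has stopped growing. A minimal standalone sketch of the same two-pass pattern; ubatch_seq_ids and fill_seq_ids are hypothetical names for illustration, not the llama.cpp API:

    // Sketch of the "contiguous storage + deferred pointer fixup" pattern.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using llama_seq_id = int32_t;

    struct ubatch_seq_ids {
        std::vector<int32_t>        n_seq_id;    // number of sequence ids per token
        std::vector<llama_seq_id *> seq_id;      // per-token pointers into seq_id_data
        std::vector<llama_seq_id>   seq_id_data; // owning, contiguous storage
    };

    void fill_seq_ids(ubatch_seq_ids & u, const std::vector<std::vector<llama_seq_id>> & per_token) {
        u.n_seq_id.clear();
        u.seq_id.assign(per_token.size(), nullptr);
        u.seq_id_data.clear();

        // pass 1: copy every id into the contiguous buffer
        size_t total = 0;
        for (const auto & ids : per_token) {
            total += ids.size();
        }
        u.seq_id_data.reserve(total);

        for (const auto & ids : per_token) {
            u.n_seq_id.push_back((int32_t) ids.size());
            u.seq_id_data.insert(u.seq_id_data.end(), ids.begin(), ids.end());
        }

        // pass 2: only now, when seq_id_data can no longer reallocate, set the per-token pointers
        llama_seq_id * p = u.seq_id_data.data();
        for (size_t i = 0; i < per_token.size(); ++i) {
            u.seq_id[i] = p;
            p += u.n_seq_id[i];
        }
    }

The second pass matters because pointers into a std::vector are invalidated by reallocation; deferring them until the buffer is fully populated keeps seq_id valid for the lifetime of the ubatch.
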
package/src/llama.cpp/src/llama-batch.h

@@ -56,13 +56,15 @@ struct llama_ubatch {
  std::vector<float> embd;
  std::vector<llama_pos> pos;
  std::vector<int32_t> n_seq_id;
- std::vector<llama_seq_id *> seq_id;
+ std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
  std::vector<llama_seq_id> seq_id_unq;
  std::vector<int32_t> seq_idx;
  std::vector<int8_t> output;
+
+ std::vector<llama_seq_id> seq_id_data;
  };

- // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+ // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
  std::shared_ptr<data_t> data;
  };

package/src/llama.cpp/src/llama-context.cpp

@@ -9,6 +9,7 @@
  #include "llama-model.h"

  #include <cinttypes>
+ #include <cmath>
  #include <cstring>
  #include <limits>
  #include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  }

+ if (cparams.yarn_ext_factor != 0) {
+ static auto get_mscale = [](float scale, float mscale) {
+ return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+ };
+
+ const float factor = 1.0f / cparams.rope_freq_scale;
+
+ // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+ if (hparams.rope_yarn_log_mul != 0.0f) {
+ // note: here we assume `mscale == 1.0f`
+ // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+ float mscale = 1.0f;
+ const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+ // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+ // special-case DEEPSEEK v2:
+ // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+ if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+ mscale = mscale_all_dims;
+ }
+
+ cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+ LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+ __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+ } else {
+ cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+ }
+
+ // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+ // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+ //
+ // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+ // https://github.com/ggml-org/llama.cpp/pull/17945
+ cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+ }
+
  cparams.yarn_attn_factor *= hparams.rope_attn_factor;

  if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
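
For reference, the yarn_attn_factor logic added above can be exercised in isolation. The sketch below mirrors the formula (the mscale ratio followed by cancellation of the RoPE op's internal 1 + 0.1*ln(1/freq_scale) scaling) but is illustrative only: it is not the llama.cpp implementation, it omits the DeepSeek-V2 special case and the final multiplication by hparams.rope_attn_factor, and the numbers in main() are made up for demonstration.

    // Standalone sketch of the yarn_attn_factor computation.
    #include <cmath>
    #include <cstdio>

    static float get_mscale(float scale, float mscale) {
        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
    }

    // freq_scale plays the role of cparams.rope_freq_scale,
    // mscale_all_dims the role of hparams.rope_yarn_log_mul
    static float yarn_attn_factor(float freq_scale, float mscale, float mscale_all_dims) {
        const float factor = 1.0f / freq_scale;

        float f = (mscale_all_dims != 0.0f)
            ? get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims)
            : get_mscale(factor, 1.0f);

        // cancel the 1 + 0.1*ln(1/freq_scale) scaling that the YaRN RoPE op applies internally
        f *= 1.0f / (1.0f + 0.1f * logf(factor));
        return f;
    }

    int main() {
        // made-up example: 4x context extension (freq_scale = 0.25), mscale == mscale_all_dims == 1
        // the mscale ratio is 1, so only the cancellation term remains: 1 / (1 + 0.1*ln(4)) ~= 0.878
        printf("yarn_attn_factor = %.4f\n", yarn_attn_factor(0.25f, 1.0f, 1.0f));
        return 0;
    }
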
@@ -93,14 +131,6 @@ llama_context::llama_context(
  // with causal attention, the batch size is limited by the context size
  cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

- // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
- // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
- // ref: https://github.com/ggerganov/llama.cpp/pull/5021
- // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
- if (cparams.n_batch < GGML_KQ_MASK_PAD) {
- LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
- cparams.n_batch = GGML_KQ_MASK_PAD;
- }
  cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

  cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(

  backend_buft.clear();
  backend_ptrs.clear();
+ backend_buf_exp_size.clear();

  for (auto & backend : backends) {
  auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,11 +275,15 @@ llama_context::llama_context(

  backend_buft.push_back(buft);
  backend_ptrs.push_back(backend.get());
+ backend_buf_exp_size.push_back(0);
  }

  LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());

- const size_t max_nodes = this->graph_max_nodes();
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);

  LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

@@ -300,9 +335,6 @@ llama_context::llama_context(

  cross.v_embd.clear();

- const uint32_t n_seqs = cparams.n_seq_max;
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
  // avoid reserving graphs with zero outputs - assume one output per sequence
  n_outputs = n_seqs;

@@ -359,7 +391,8 @@ llama_context::llama_context(

  // reserve pp (prompt processing) graph first so that buffers are only allocated once
  {
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
  if (!gf) {
  if (pipeline_parallel) {
  LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(

  // reserve with tg (token generation) graph to get the number of splits and nodes
  {
- auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
  if (!gf) {
  throw std::runtime_error("failed to allocate compute tg buffers");
  }
@@ -392,7 +425,7 @@ llama_context::llama_context(
  //
  // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
  //
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
  if (!gf) {
  throw std::runtime_error("failed to allocate compute pp buffers");
  }
@@ -401,11 +434,13 @@ llama_context::llama_context(
  for (size_t i = 0; i < backend_ptrs.size(); ++i) {
  ggml_backend_t backend = backend_ptrs[i];
  ggml_backend_buffer_type_t buft = backend_buft[i];
- size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
- if (size > 1) {
+ if (!model.hparams.no_alloc) {
+ backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ if (backend_buf_exp_size[i] > 1) {
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
  ggml_backend_buft_name(buft),
- size / 1024.0 / 1024.0);
+ backend_buf_exp_size[i] / 1024.0 / 1024.0);
  }
  }

@@ -424,6 +459,23 @@ llama_context::llama_context(
  }

  llama_context::~llama_context() {
+ // FIXME this currently results in a use-after-free bug if the model is freed before the context
+ // if (!model.hparams.no_alloc) {
+ // for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ // ggml_backend_t backend = backend_ptrs[i];
+ // ggml_backend_buffer_type_t buft = backend_buft[i];
+
+ // const size_t size_exp = backend_buf_exp_size[i];
+ // const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ // if (size_exp == size_act) {
+ // LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ // } else {
+ // LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+ // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ // }
+ // }
+ // }
  ggml_opt_free(opt_ctx);
  }

@@ -1326,6 +1378,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
  // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
  LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
+ synchronize();
  buf_output = nullptr;
  logits = nullptr;
  embd = nullptr;
@@ -1386,9 +1439,9 @@ void llama_context::output_reorder() {
  // graph
  //

- uint32_t llama_context::graph_max_nodes() const {
+ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
  if (model.arch == LLM_ARCH_QWEN3NEXT) {
- return std::max<uint32_t>(8192u, 32u*model.n_tensors());
+ return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
  }
  return std::max<uint32_t>(1024u, 8u*model.n_tensors());
  }
@@ -1397,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
  return static_cast<llm_graph_result *>(gf_res_reserve.get());
  }

- ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
+ ggml_cgraph * llama_context::graph_reserve(
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
  LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
  GGML_ASSERT(n_outputs >= 1);

@@ -1434,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u

  // initialize scheduler with the specified graph
  if (split_only) {
- ggml_backend_sched_split_graph(sched.get(), gf);
+ if (sizes) {
+ ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+ } else {
+ ggml_backend_sched_split_graph(sched.get(), gf);
+ }
  } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+ GGML_ASSERT(!sizes);
  LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  return nullptr;
  }
@@ -2057,15 +2116,26 @@ void llama_context::perf_reset() {

  std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
- for (const auto & buft_size : model.memory_breakdown()) {
- ret[buft_size.first].model += buft_size.second;
+ for (const auto & [buft, size] : model.memory_breakdown()) {
+ ret[buft].model += size;
  }
- for (const auto & buft_size : memory->memory_breakdown()) {
- ret[buft_size.first].context += buft_size.second;
+ if (memory) {
+ for (const auto & [buft, size] : memory->memory_breakdown()) {
+ ret[buft].context += size;
+ }
  }
- for (const auto & backend_ptr : backends) {
- ggml_backend_t backend = backend_ptr.get();
- ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (model.hparams.no_alloc) {
+ for (size_t i = 0; i < backends.size(); ++i) {
+ ggml_backend_t backend = backends[i].get();
+ ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+ ret[buft].compute += backend_buf_exp_size[i];
+ }
+ } else {
+ for (const auto & backend_ptr : backends) {
+ ggml_backend_t backend = backend_ptr.get();
+ ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+ ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
  }
  return ret;
  }
package/src/llama.cpp/src/llama-context.h

@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
  size_t model = 0; // memory allocated for the model
  size_t context = 0; // memory allocated for the context
  size_t compute = 0; // memory allocated for temporary compute buffers
+
+ size_t total() const {
+ return model + context + compute;
+ }
  };

  struct llama_context {
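
The new total() helper simply sums the three per-buffer-type counters. A tiny self-contained usage sketch, keyed by a plain string here instead of ggml_backend_buffer_type_t, with grand_total as a hypothetical helper for illustration:

    #include <cstddef>
    #include <map>
    #include <string>

    struct llama_memory_breakdown_data {
        size_t model   = 0; // memory allocated for the model
        size_t context = 0; // memory allocated for the context
        size_t compute = 0; // memory allocated for temporary compute buffers

        size_t total() const { return model + context + compute; }
    };

    size_t grand_total(const std::map<std::string, llama_memory_breakdown_data> & breakdown) {
        size_t sum = 0;
        for (const auto & [buft, data] : breakdown) {
            (void) buft;
            sum += data.total(); // per-buffer-type totals add up to the overall footprint
        }
        return sum;
    }
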
@@ -197,7 +201,7 @@ private:
  //

  public:
- uint32_t graph_max_nodes() const;
+ uint32_t graph_max_nodes(uint32_t n_tokens) const;

  // can reuse the llm_graph_result instance of the context (for example to update a memory module)
  llm_graph_result * get_gf_res_reserve() const;
@@ -206,7 +210,8 @@ public:
  ggml_status graph_compute(ggml_cgraph * gf, bool batched);

  // reserve a graph with a dummy ubatch of the specified size
- ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
+ ggml_cgraph * graph_reserve(
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

  private:
  llm_graph_params graph_params(
@@ -281,9 +286,10 @@ private:

  std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

- // buffer types used for the compute buffer of each backend
+ // pointers and buffer types used for the compute buffer of each backend
  std::vector<ggml_backend_t> backend_ptrs;
  std::vector<ggml_backend_buffer_type_t> backend_buft;
+ std::vector<size_t> backend_buf_exp_size; // expected buffer sizes

  llm_graph_result_ptr gf_res_prev;
  llm_graph_result_ptr gf_res_reserve;