@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/llama-arch.h

@@ -3,6 +3,7 @@
 #include "ggml.h" // ggml_op

 #include <string>
+#include <set>

 //
 // gguf constants (sync with gguf.py)
@@ -79,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_NEMOTRON_H,
+    LLM_ARCH_NEMOTRON_H_MOE,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
@@ -315,6 +317,7 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
     LLM_TENSOR_ROPE_FACTORS_LONG,
     LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
     const int bid;
     const int xid;

+    const std::set<llm_tensor> model_tensors;
+
+    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
     std::string str() const;

     operator std::string() const {
@@ -546,11 +553,11 @@ struct LLM_TN {
     llm_arch arch;

     LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return { arch, tensor, suffix, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
     }

     LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
-        return { arch, tensor, nullptr, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
     }
 };

package/src/llama.cpp/src/llama-batch.cpp

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
     udata->output  .resize(n_tokens);

+    udata->seq_id_data.reserve(n_tokens);
+
     seq_set_t seq_set_unq;

     for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }

         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i]   = batch.seq_id[idxs[i]];
         udata->output[i]   = batch.logits[idxs[i]];

         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }

         if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }

+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
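
Note on the hunks above (and on the llama-batch.h hunk that follows): the per-token seq_id pointers previously aliased the caller-provided batch arrays; they now point into a vector owned by the ubatch itself, and the pointers are wired up only in a second pass after every push_back has completed, because growing the vector could reallocate and invalidate pointers taken earlier. A minimal, self-contained sketch of that two-pass pattern; the types here are simplified stand-ins, not the llama.cpp structures:

// Two-pass pattern: copy all ids into one owned buffer, then set the
// per-token pointers once the buffer can no longer reallocate.
#include <cstdint>
#include <cstdio>
#include <vector>

using seq_id_t = int32_t;

struct ubatch_sketch {
    std::vector<int32_t>    n_seq_id;    // number of ids per token
    std::vector<seq_id_t *> seq_id;      // per-token pointer into seq_id_data
    std::vector<seq_id_t>   seq_id_data; // owned, contiguous storage
};

int main() {
    const std::vector<std::vector<seq_id_t>> ids_per_token = {{0}, {0, 1}, {1}};

    ubatch_sketch u;
    u.seq_id.resize(ids_per_token.size());

    // pass 1: copy all ids into the owned buffer
    for (const auto & ids : ids_per_token) {
        u.n_seq_id.push_back((int32_t) ids.size());
        u.seq_id_data.insert(u.seq_id_data.end(), ids.begin(), ids.end());
    }

    // pass 2: seq_id_data no longer grows, so the pointers stay valid
    seq_id_t * ptr = u.seq_id_data.data();
    for (size_t i = 0; i < ids_per_token.size(); ++i) {
        u.seq_id[i] = ptr;
        ptr += u.n_seq_id[i];
    }

    printf("token 1, first seq id: %d\n", u.seq_id[1][0]); // prints 0
    return 0;
}
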
package/src/llama.cpp/src/llama-batch.h

@@ -56,13 +56,15 @@ struct llama_ubatch {
         std::vector<float>          embd;
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
         std::vector<llama_seq_id>   seq_id_unq;
        std::vector<int32_t>        seq_idx;
         std::vector<int8_t>         output;
+
+        std::vector<llama_seq_id>   seq_id_data;
     };

-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };

package/src/llama.cpp/src/llama-context.cpp

@@ -9,6 +9,7 @@
 #include "llama-model.h"

 #include <cinttypes>
+#include <cmath>
 #include <cstring>
 #include <limits>
 #include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+            float mscale = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }
+
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;

     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
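
For reference, the yarn_attn_factor arithmetic added above can be reproduced in isolation. The sketch below mirrors the formulas from the hunk; the rope_freq_scale and mscale_all_dims values are illustrative placeholders, and mscale is fixed at 1.0f as the hunk's TODO assumes. This is not the llama.cpp implementation itself.

#include <cmath>
#include <cstdio>

static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
}

int main() {
    const float rope_freq_scale = 0.25f;   // e.g. a 4x YaRN context extension (illustrative)
    const float factor          = 1.0f / rope_freq_scale;
    const float mscale          = 1.0f;    // assumed, see the TODO in the diff
    const float mscale_all_dims = 0.707f;  // illustrative stand-in for rope_yarn_log_mul

    float yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);

    // cancel the 1 + 0.1*ln(factor) scaling applied inside the rope op when yarn_ext_factor != 0
    yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));

    printf("effective yarn_attn_factor = %.4f\n", yarn_attn_factor);
    return 0;
}
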
@@ -93,14 +131,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

     cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(

         backend_buft.clear();
         backend_ptrs.clear();
+        backend_buf_exp_size.clear();

         for (auto & backend : backends) {
             auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,6 +275,7 @@ llama_context::llama_context(

             backend_buft.push_back(buft);
             backend_ptrs.push_back(backend.get());
+            backend_buf_exp_size.push_back(0);
         }

         LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
@@ -359,7 +391,8 @@ llama_context::llama_context(

         // reserve pp (prompt processing) graph first so that buffers are only allocated once
         {
-            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+                    model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
             if (!gf) {
                 if (pipeline_parallel) {
                     LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(

         // reserve with tg (token generation) graph to get the number of splits and nodes
         {
-            auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+            auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
             if (!gf) {
                 throw std::runtime_error("failed to allocate compute tg buffers");
             }
@@ -392,7 +425,7 @@ llama_context::llama_context(
             //
            // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
            //
-            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
             if (!gf) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -401,11 +434,13 @@ llama_context::llama_context(
         for (size_t i = 0; i < backend_ptrs.size(); ++i) {
             ggml_backend_t backend = backend_ptrs[i];
             ggml_backend_buffer_type_t buft = backend_buft[i];
-            size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-            if (size > 1) {
+            if (!model.hparams.no_alloc) {
+                backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            }
+            if (backend_buf_exp_size[i] > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                         ggml_backend_buft_name(buft),
-                        size / 1024.0 / 1024.0);
+                        backend_buf_exp_size[i] / 1024.0 / 1024.0);
             }
         }

@@ -424,6 +459,23 @@ llama_context::llama_context(
 }

 llama_context::~llama_context() {
+    // FIXME this currently results in a use-after-free bug if the model is freed before the context
+    // if (!model.hparams.no_alloc) {
+    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+    //         ggml_backend_t backend = backend_ptrs[i];
+    //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+    //         const size_t size_exp = backend_buf_exp_size[i];
+    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    //         if (size_exp == size_act) {
+    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         } else {
+    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         }
+    //     }
+    // }
     ggml_opt_free(opt_ctx);
 }

@@ -1326,6 +1378,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
         LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+        synchronize();
         buf_output = nullptr;
         logits = nullptr;
         embd = nullptr;
@@ -1397,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
     return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }

-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
+ggml_cgraph * llama_context::graph_reserve(
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);

@@ -1434,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u

     // initialize scheduler with the specified graph
     if (split_only) {
-        ggml_backend_sched_split_graph(sched.get(), gf);
+        if (sizes) {
+            ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+        } else {
+            ggml_backend_sched_split_graph(sched.get(), gf);
+        }
     } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        GGML_ASSERT(!sizes);
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
@@ -2057,15 +2116,26 @@ void llama_context::perf_reset() {

 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-    for (const auto & buft_size : model.memory_breakdown()) {
-        ret[buft_size.first].model += buft_size.second;
+    for (const auto & [buft, size] : model.memory_breakdown()) {
+        ret[buft].model += size;
     }
-    for (const auto & buft_size : memory->memory_breakdown()) {
-        ret[buft_size.first].context += buft_size.second;
+    if (memory) {
+        for (const auto & [buft, size] : memory->memory_breakdown()) {
+            ret[buft].context += size;
+        }
     }
-    for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    if (model.hparams.no_alloc) {
+        for (size_t i = 0; i < backends.size(); ++i) {
+            ggml_backend_t backend = backends[i].get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += backend_buf_exp_size[i];
+        }
+    } else {
+        for (const auto & backend_ptr : backends) {
+            ggml_backend_t backend = backend_ptr.get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
     }
     return ret;
 }

package/src/llama.cpp/src/llama-context.h

@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
     size_t model   = 0; // memory allocated for the model
     size_t context = 0; // memory allocated for the context
     size_t compute = 0; // memory allocated for temporary compute buffers
+
+    size_t total() const {
+        return model + context + compute;
+    }
 };

 struct llama_context {
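
The total() helper simply sums the three fields. A hedged, caller-side sketch of aggregating such a per-buffer-type breakdown; the string key and the struct below are simplified stand-ins for ggml_backend_buffer_type_t and llama_memory_breakdown_data, not the real API:

#include <cstdio>
#include <map>
#include <string>

// simplified stand-in for llama_memory_breakdown_data
struct breakdown {
    size_t model = 0, context = 0, compute = 0;
    size_t total() const { return model + context + compute; }
};

int main() {
    // keyed by a backend/buffer-type name instead of ggml_backend_buffer_type_t
    const std::map<std::string, breakdown> mem = {
        { "GPU0", { size_t(4000) << 20, size_t(512) << 20, size_t(128) << 20 } },
        { "CPU",  { size_t(256)  << 20, size_t(64)  << 20, size_t(32)  << 20 } },
    };

    size_t grand_total = 0;
    for (const auto & [buft, b] : mem) {
        printf("%-5s model = %5zu MiB, context = %4zu MiB, compute = %4zu MiB\n",
               buft.c_str(), b.model >> 20, b.context >> 20, b.compute >> 20);
        grand_total += b.total();
    }
    printf("total = %zu MiB\n", grand_total >> 20);
    return 0;
}
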
@@ -206,7 +210,8 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);

     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
+    ggml_cgraph * graph_reserve(
+            uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

 private:
     llm_graph_params graph_params(
@@ -281,9 +286,10 @@ private:

     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

-    // buffer types used for the compute buffer of each backend
+    // pointers and buffer types used for the compute buffer of each backend
     std::vector<ggml_backend_t>             backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<size_t>                     backend_buf_exp_size; // expected buffer sizes

     llm_graph_result_ptr gf_res_prev;
     llm_graph_result_ptr gf_res_reserve;

package/src/llama.cpp/src/llama-graph.cpp

@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
-                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
             ) * f_attn_temp_scale + 1.0;
         }

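The only functional change above is that the previously hard-coded 1.0f offset now comes from f_attn_temp_offset. A standalone sketch of the resulting temperature-tuned attention scale, with illustrative parameter values (not taken from any specific model):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const float    f_attn_temp_scale       = 0.1f;
    const float    f_attn_temp_offset      = 1.0f;  // the old behavior used a literal 1.0f here
    const uint32_t n_attn_temp_floor_scale = 8192;

    for (const int pos : { 0, 8191, 8192, 65536 }) {
        const float attn_scale = std::log(
            std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
        ) * f_attn_temp_scale + 1.0;

        printf("pos = %6d -> attn_scale = %.4f\n", pos, attn_scale);
    }
    return 0;
}
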
@@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     }
 }

+bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= s_copy->ne[0] == mctx->get_n_rs();
+
+    res &= s_copy_main->ne[0]  == params.ubatch.n_seqs;
+    res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= head == mctx->get_head();
+    res &= rs_z == mctx->get_rs_z();
+
+    return res;
+}
+
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);

@@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     return res;
 }
@@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;

     return res;
 }
@@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }

-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
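
Background for the GGML_PAD removals above and in the build_attn_inp_* hunks further down: GGML_PAD(x, n) rounds x up to the next multiple of n, so the second dimension of the KQ mask used to be padded from n_tokens up to a multiple of GGML_KQ_MASK_PAD; these hunks size the mask by n_tokens directly. A minimal sketch of the rounding arithmetic (macro written in the bitmask form ggml uses; shown only for illustration, and valid when n is a power of two):

#include <cstdio>

// round x up to the next multiple of n (n must be a power of two)
#define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    printf("PAD( 3, 32) = %d\n", PAD( 3, 32)); // 32
    printf("PAD(32, 32) = %d\n", PAD(32, 32)); // 32
    printf("PAD(33, 32) = %d\n", PAD(33, 32)); // 64
    return 0;
}
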
@@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    inp_attn->set_input(ubatch);
-    inp_rs->set_input(ubatch);
+    mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+    mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+    mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+    const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+    if (inp_rs->s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+        int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mctx->get_recr()->s_copy(i);
+        }
+    }
+}
+
+bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+    //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+    res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+    res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= inp_rs->head == mctx->get_recr()->get_head();
+    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+    return res;
 }

 //
@@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_relu(ctx0, cur);
                 cb(cur, "ffn_moe_relu", il);
             } break;
+        case LLM_FFN_RELU_SQR:
+            if (gate_exps) {
+                // TODO: add support for gated squared relu
+                GGML_ABORT("fatal error: gated squared relu not implemented");
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cur = ggml_sqr(ctx0, cur);
+                cb(cur, "ffn_moe_relu_sqr", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
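
For reference, the new LLM_FFN_RELU_SQR branch computes squared ReLU, y = max(x, 0)^2, assembled in the graph as ggml_relu followed by ggml_sqr (the gated variant is explicitly unimplemented). A plain scalar sketch, not the ggml implementation:

#include <algorithm>
#include <cstdio>

static float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    for (const float x : { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f }) {
        printf("relu_sqr(%+.1f) = %.2f\n", x, relu_sqr(x));
    }
    return 0;
}
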
@@ -1203,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

     auto & cur = inp->attn_scale;

@@ -1470,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);

         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1558,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1701,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);

     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask_swa);

     inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
     inp->s_copy_main  = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
     inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);

+    inp->head = mctx_cur->get_head();
+    inp->rs_z = mctx_cur->get_rs_z();
+
     return inp;
 }

@@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

-    auto inp_rs   = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+    auto inp_rs   = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);

     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

package/src/llama.cpp/src/llama-graph.h

@@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;
@@ -142,6 +142,7 @@ public:

     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
+    const float    f_attn_temp_offset;
 };

 class llm_graph_input_pos_bucket : public llm_graph_input_i {
@@ -224,6 +225,8 @@ public:

     void set_input(const llama_ubatch * ubatch) override;

+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * s_copy; // I32 [n_rs]

     // views of s_copy, computed once per graph
@@ -232,6 +235,10 @@ public:
     ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]

     const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t  rs_z;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -364,22 +371,28 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
             std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
-            std::unique_ptr<llm_graph_input_rs> inp_rs,
-            const llama_memory_hybrid_context * mctx) :
+            std::unique_ptr<llm_graph_input_rs>      inp_rs,
+            const llama_memory_hybrid_context *      mctx) :
         inp_attn(std::move(inp_attn)),
         inp_rs(std::move(inp_rs)),
+        cparams(cparams),
         mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;

     void set_input(const llama_ubatch * ubatch) override;

+    bool can_reuse(const llm_graph_params & params) override;
+
     std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
     std::unique_ptr<llm_graph_input_rs>      inp_rs;

     llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
     llm_graph_input_rs *      get_recr() const { return inp_rs.get(); }

+    const llama_cparams cparams;
+
     const llama_memory_hybrid_context * mctx;
 };

package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,8 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+
+#include <algorithm>
 #include <cassert>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@@ -229,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

     return false;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}