@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-graph.cpp

@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
-                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
             ) * f_attn_temp_scale + 1.0;
         }
 
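For reference, the per-token attention-temperature scale computed in the hunk above reduces to a small pure function. This is an illustrative sketch only; attn_temp_scale is a hypothetical helper name, while the formula itself is taken from the changed lines:

    #include <cmath>
    #include <cstdint>

    // Per-token attention-temperature scale as computed in llm_graph_input_attn_temp::set_input().
    // The new f_attn_temp_offset hparam replaces the previously hard-coded +1.0f added to the position.
    static float attn_temp_scale(float pos, float f_attn_temp_offset,
                                 uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) {
        return std::log(std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0f)
               * f_attn_temp_scale + 1.0f;
    }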
@@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= s_copy->ne[0] == mctx->get_n_rs();
+
+    res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= head == mctx->get_head();
+    res &= rs_z == mctx->get_rs_z();
+
+    return res;
+}
+
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);
 
@@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
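This hunk, and several below that touch the KQ mask, drop the same padding: the mask's second dimension is now exactly the number of tokens in the ubatch instead of a value rounded up with GGML_PAD. A rough illustration of the rounding being removed; pad_up is a hypothetical stand-in, not the actual macro definition from ggml.h:

    #include <cstdint>

    // Round x up to the next multiple of n, which is what GGML_PAD(x, n) does for the mask rows.
    // After this change the mask has exactly n_tokens rows rather than pad_up(n_tokens, GGML_KQ_MASK_PAD).
    static inline int64_t pad_up(int64_t x, int64_t n) {
        return ((x + n - 1) / n) * n; // e.g. pad_up(13, 32) == 32
    }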
@@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }
 
-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+        for (int i = n_tokens; i < n_tokens; ++i) {
            for (int j = 0; j < n_enc; ++j) {
                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
            }
@@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    inp_attn->set_input(ubatch);
-    inp_rs->set_input(ubatch);
+    mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+    mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+    mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+    const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+    if (inp_rs->s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+        int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mctx->get_recr()->s_copy(i);
+        }
+    }
+}
+
+bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+    //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+    res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+    res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+    res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= inp_rs->head == mctx->get_recr()->get_head();
+    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+    return res;
 }
 
 //
@@ -973,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
            // mask out the other groups
            selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-           selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+           selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
            selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
            cb(selection_probs, "ffn_moe_probs_masked", il);
        }
@@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                cur = ggml_relu(ctx0, cur);
                cb(cur, "ffn_moe_relu", il);
            } break;
+        case LLM_FFN_RELU_SQR:
+            if (gate_exps) {
+                // TODO: add support for gated squared relu
+                GGML_ABORT("fatal error: gated squared relu not implemented");
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cur = ggml_sqr(ctx0, cur);
+                cb(cur, "ffn_moe_relu_sqr", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
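The new LLM_FFN_RELU_SQR branch applies a ReLU and then squares the result elementwise (ggml_relu followed by ggml_sqr). A scalar sketch of the activation; relu_sqr is an illustrative helper, not a function in the codebase:

    #include <algorithm>

    // Squared ReLU: max(x, 0)^2, the per-element effect of ggml_relu followed by ggml_sqr.
    static float relu_sqr(float x) {
        const float r = std::max(x, 0.0f);
        return r * r;
    }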
@@ -1203,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
 
     auto & cur = inp->attn_scale;
 
@@ -1470,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1558,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1701,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
+    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask_swa);
 
     inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
     inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
     inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
 
+    inp->head = mctx_cur->get_head();
+    inp->rs_z = mctx_cur->get_rs_z();
+
     return inp;
 }
 
@@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+    auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
 
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

package/src/llama.cpp/src/llama-graph.h

@@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
     virtual ~llm_graph_input_attn_temp() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -142,6 +142,7 @@ public:
 
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
+    const float f_attn_temp_offset;
 };
 
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
@@ -224,6 +225,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * s_copy; // I32 [n_rs]
 
     // views of s_copy, computed once per graph
@@ -232,6 +235,10 @@ public:
     ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t rs_z;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -364,22 +371,28 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
             std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
-            std::unique_ptr<llm_graph_input_rs> inp_rs,
-            const llama_memory_hybrid_context * mctx) :
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
         inp_attn(std::move(inp_attn)),
         inp_rs(std::move(inp_rs)),
+        cparams(cparams),
         mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
     std::unique_ptr<llm_graph_input_rs> inp_rs;
 
     llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
     llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
 
+    const llama_cparams cparams;
+
     const llama_memory_hybrid_context * mctx;
 };
 

package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,8 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
+
+#include <algorithm>
 #include <cassert>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@@ -229,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
 
     return false;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}

package/src/llama.cpp/src/llama-hparams.h

@@ -34,6 +34,7 @@ struct llama_hparams_convnext {
 
 struct llama_hparams {
     bool vocab_only;
+    bool no_alloc;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;
@@ -107,6 +108,7 @@ struct llama_hparams {
     float rope_freq_base_train_swa;
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa;
+
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;
 
@@ -164,6 +166,7 @@ struct llama_hparams {
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 0;
     float f_attn_temp_scale = 0.0f;
+    float f_attn_temp_offset = 0.0f; // offset position index
 
     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs
@@ -267,7 +270,8 @@ struct llama_hparams {
     // TODO: think of a better place for this function
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+
+    bool use_mrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-

package/src/llama.cpp/src/llama-impl.cpp

@@ -25,6 +25,10 @@ time_meas::~time_meas() {
     }
 }
 
+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    ggml_log_get(log_callback, user_data);
+}
+
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
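The newly exported llama_log_get is the counterpart of the existing llama_log_set, so a caller can save the current logger and restore it later. A hedged usage sketch under that assumption; quiet_logger and run_quietly are illustrative names, not part of the library:

    #include "llama.h"

    // Illustrative no-op logger that drops all llama.cpp log output.
    static void quiet_logger(enum ggml_log_level /*level*/, const char * /*text*/, void * /*user_data*/) {
    }

    static void run_quietly() {
        ggml_log_callback prev_cb   = nullptr;
        void *            prev_data = nullptr;

        llama_log_get(&prev_cb, &prev_data);  // remember the current callback
        llama_log_set(quiet_logger, nullptr); // silence logging

        // ... noisy work (e.g. model loading) goes here ...

        llama_log_set(prev_cb, prev_data);    // restore the previous callback
    }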

package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(
 
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+        }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }
@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [ctx, buf] : ctxs_bufs) {
+        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+        if (hparams.no_alloc) {
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+            ret[buft] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
+
     return ret;
 }
 
@@ -1232,8 +1249,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     GGML_ASSERT(n_tokens%n_stream == 0);
 
     // n_tps == n_tokens_per_stream
-    const int64_t n_tps = n_tokens/n_stream;
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
+    const int64_t n_tps = n_tokens/n_stream;
 
     std::fill(data, data + ggml_nelements(dst), -INFINITY);
 
@@ -1266,7 +1282,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
        const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
        const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
 
-       const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+       const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
 
        for (uint32_t j = 0; j < n_kv; ++j) {
            if (cells.is_empty(j)) {
@@ -1370,9 +1386,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
     float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
-    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
-    const auto & yarn_beta_fast = cparams.yarn_beta_fast;
-    const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
+    const auto & yarn_beta_fast = cparams.yarn_beta_fast;
+    const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;
 
     const auto & n_rot = hparams.n_rot;
     const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1383,12 +1400,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
        ? LLAMA_ROPE_TYPE_NEOX
        : hparams.rope_type;
 
-    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-        : cparams.yarn_attn_factor;
-
     ggml_tensor * tmp;
 
     if (ggml_is_quantized(cur->type)) {
@@ -1550,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
 
         const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
 
+        slot_info sinfo;
+
         bool res = true;
-        res = res && state_read_meta(io, strm, cell_count, seq_id);
-        res = res && state_read_data(io, strm, cell_count);
+        res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+        res = res && state_read_data(io, strm, cell_count, sinfo);
 
         if (!res) {
             if (seq_id == -1) {
@@ -1691,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
     }
 }
 
-bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head = v_heads[strm];
 
@@ -1728,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
            ubatch.seq_id[i] = &dest_seq_id;
        }
 
-        const auto sinfo = find_slot(ubatch, true);
+        sinfo = find_slot(ubatch, false);
        if (sinfo.empty()) {
            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
            return false;
@@ -1738,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
        // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
        apply_ubatch(sinfo, ubatch);
 
-        const auto head_cur = sinfo.head();
-
-        // keep the head at the old position because we will read the KV data into it in state_read_data()
-        head = head_cur;
+        LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
 
-        LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
-
-        // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
-        // Assume that this is one contiguous block of cells
-        GGML_ASSERT(head_cur + cell_count <= cells.size());
-        GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
-        GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
-        GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
-        GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
+        // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+        GGML_ASSERT(sinfo.n_stream() == 1);
+        GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            const uint32_t idx = sinfo.idxs[0][i];
+            GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+            GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+        }
    } else {
        // whole KV cache restore
 
@@ -1784,15 +1793,24 @@
            }
        }
 
+        // Create contiguous slot_info for whole cache restore
+        sinfo.s0 = strm;
+        sinfo.s1 = strm;
+        sinfo.resize(1);
+        sinfo.strm[0] = strm;
+        sinfo.idxs[0].resize(cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            sinfo.idxs[0][i] = i;
+        }
+
        head = 0;
    }
 
    return true;
 }
 
-bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
    auto & cells = v_cells[strm];
-    auto & head = v_heads[strm];
 
    uint32_t v_trans;
    uint32_t n_layer;
@@ -1842,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        }
 
        if (cell_count) {
-            // Read and set the keys for the whole cell range
-            ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * k_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+                    ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
+                }
+            }
        }
    }
 
@@ -1874,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
            }
 
            if (cell_count) {
-                // Read and set the values for the whole cell range
-                ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+                if (sinfo.is_contiguous()) {
+                    // Fast path: contiguous cells, single memcpy
+                    ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+                } else {
+                    // Slow path: scatter to non-contiguous positions
+                    const void * src = io.read(cell_count * v_size_row);
+                    for (uint32_t i = 0; i < cell_count; ++i) {
+                        const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+                        ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
+                    }
+                }
            }
        }
    } else {
@@ -1914,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
            }
 
            if (cell_count) {
-                // For each row in the transposed matrix, read the values for the whole cell range
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    const size_t dst_offset = (head + j * cells.size()) * v_size_el;
-                    ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                if (sinfo.is_contiguous()) {
+                    // Fast path: contiguous cells
+                    const uint32_t h = sinfo.head();
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const size_t dst_offset = (h + j * cells.size()) * v_size_el;
+                        ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    }
+                } else {
+                    // Slow path: scatter to non-contiguous positions
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const void * src = io.read(cell_count * v_size_el);
+                        for (uint32_t i = 0; i < cell_count; ++i) {
+                            const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+                            ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
+                        }
+                    }
                }
            }
        }

package/src/llama.cpp/src/llama-kv-cache.h

@@ -72,6 +72,23 @@ public:
        void clear() {
            idxs.clear();
        }
+
+        // check if indices are contiguous starting from head()
+        bool is_contiguous() const {
+            if (idxs.empty() || idxs[0].empty()) {
+                return true;
+            }
+            if (idxs.size() > 1) {
+                return false;
+            }
+            const uint32_t h = idxs[0][0];
+            for (size_t i = 0; i < idxs[0].size(); ++i) {
+                if (idxs[0][i] != h + i) {
+                    return false;
+                }
+            }
+            return true;
+        }
    };
 
    using slot_info_vec_t = std::vector<slot_info>;
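slot_info::is_contiguous() above is what lets state_read_data() choose between the fast path (one ggml_backend_tensor_set over a contiguous run of cells) and the per-cell scatter path. A standalone sketch of the same rule on a plain index vector, for illustration only:

    #include <cstdint>
    #include <vector>

    // Contiguity rule used by slot_info::is_contiguous(): every index must equal
    // the first index plus its position in the run; an empty run counts as contiguous.
    static bool is_contiguous_run(const std::vector<uint32_t> & idxs) {
        for (size_t i = 0; i < idxs.size(); ++i) {
            if (idxs[i] != idxs[0] + i) {
                return false;
            }
        }
        return true;
    }

    // is_contiguous_run({8, 9, 10}) == true
    // is_contiguous_run({8, 10, 11}) == false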
@@ -264,8 +281,8 @@ private:
    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
 
-    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
 };
 
 class llama_kv_cache_context : public llama_memory_context_i {

package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
    ubatches(std::move(ubatches)),
    // note: here we copy the ubatches. not sure if this is ideal
    ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
-    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
+    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
 

package/src/llama.cpp/src/llama-model-loader.cpp

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
        std::vector<std::string> & splits,
        bool use_mmap,
        bool check_tensors,
+        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
    int trace = 0;
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
 
    this->use_mmap = use_mmap;
    this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }
 
 std::string llama_model_loader::get_arch_name() const {

package/src/llama.cpp/src/llama-model-loader.h

@@ -71,6 +71,7 @@ struct llama_model_loader {
 
    bool use_mmap = false;
    bool check_tensors;
+    bool no_alloc;
 
    llama_files files;
    llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
        bool check_tensors,
+        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);