@fugood/llama.node 1.4.6 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/llama-graph.cpp

@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
-                std::floor((pos +
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
             ) * f_attn_temp_scale + 1.0;
         }

@@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     }
 }

+bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= s_copy->ne[0] == mctx->get_n_rs();
+
+    res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= head == mctx->get_head();
+    res &= rs_z == mctx->get_rs_z();
+
+    return res;
+}
+
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);

@@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] ==
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     return res;
 }

@@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] ==
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;

     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] ==
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;

     return res;
 }

@@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }

-        for (int i = n_tokens; i <
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
@@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }

 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    inp_attn->
-
+    mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+    mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+    mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+    const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+    if (inp_rs->s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+        int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mctx->get_recr()->s_copy(i);
+        }
+    }
+}
+
+bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+    //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+    res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+    res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+    res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= inp_rs->head == mctx->get_recr()->get_head();
+    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+    return res;
 }

 //

@@ -973,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

         // mask out the other groups
         selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
-        selection_probs = ggml_set_rows(ctx0,
+        selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
         selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
         cb(selection_probs, "ffn_moe_probs_masked", il);
     }

@@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                 cur = ggml_relu(ctx0, cur);
                 cb(cur, "ffn_moe_relu", il);
             } break;
+        case LLM_FFN_RELU_SQR:
+            if (gate_exps) {
+                // TODO: add support for gated squared relu
+                GGML_ABORT("fatal error: gated squared relu not implemented");
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cur = ggml_sqr(ctx0, cur);
+                cb(cur, "ffn_moe_relu_sqr", il);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }

@@ -1203,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

     auto & cur = inp->attn_scale;

@@ -1470,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens,
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens,
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);

         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;

@@ -1558,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

@@ -1701,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc,
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);

     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;

@@ -1767,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask);

         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

@@ -1781,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
         inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);

-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
         ggml_set_input(inp->self_kq_mask_swa);

         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;

@@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
     inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
     inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);

+    inp->head = mctx_cur->get_head();
+    inp->rs_z = mctx_cur->get_rs_z();
+
     return inp;
 }

@@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

-    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+    auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);

     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
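Note: the first llama-graph.cpp hunk above extends the llama4-style attention temperature scaling with a position offset (`f_attn_temp_offset`, plumbed through `llama_hparams` further down). A minimal standalone sketch of the resulting formula is shown below; the parameter values are purely illustrative placeholders, not taken from any particular model.

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Standalone sketch of the attention-temperature scale computed in
// llm_graph_input_attn_temp::set_input() after this change.
static float attn_temp_scale(float pos, float f_attn_temp_offset,
                             uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) {
    return std::log(std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0) * f_attn_temp_scale + 1.0;
}

int main() {
    const uint32_t n_attn_temp_floor_scale = 8192;  // hypothetical value
    const float    f_attn_temp_scale       = 0.1f;  // hypothetical value
    const float    f_attn_temp_offset      = 1.0f;  // hypothetical value

    for (float pos : {0.0f, 4096.0f, 16384.0f, 65536.0f}) {
        std::printf("pos = %8.0f -> attn_scale = %f\n", pos,
                    attn_temp_scale(pos, f_attn_temp_offset, n_attn_temp_floor_scale, f_attn_temp_scale));
    }
    return 0;
}
```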
package/src/llama.cpp/src/llama-graph.h

@@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

@@ -142,6 +142,7 @@ public:

     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
+    const float f_attn_temp_offset;
 };

 class llm_graph_input_pos_bucket : public llm_graph_input_i {

@@ -224,6 +225,8 @@ public:

     void set_input(const llama_ubatch * ubatch) override;

+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * s_copy; // I32 [n_rs]

     // views of s_copy, computed once per graph

@@ -232,6 +235,10 @@ public:
     ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]

     const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t rs_z;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {

@@ -364,22 +371,28 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
             std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
-            std::unique_ptr<llm_graph_input_rs>
-            const llama_memory_hybrid_context *
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
         inp_attn(std::move(inp_attn)),
         inp_rs(std::move(inp_rs)),
+        cparams(cparams),
         mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;

     void set_input(const llama_ubatch * ubatch) override;

+    bool can_reuse(const llm_graph_params & params) override;
+
     std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
     std::unique_ptr<llm_graph_input_rs> inp_rs;

     llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
     llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

+    const llama_cparams cparams;
+
     const llama_memory_hybrid_context * mctx;
 };

package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,8 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+
+#include <algorithm>
 #include <cassert>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {

@@ -229,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

     return false;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}
package/src/llama.cpp/src/llama-hparams.h

@@ -34,6 +34,7 @@ struct llama_hparams_convnext {

 struct llama_hparams {
     bool vocab_only;
+    bool no_alloc;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;

@@ -107,6 +108,7 @@ struct llama_hparams {
     float rope_freq_base_train_swa;
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa;
+
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;

@@ -164,6 +166,7 @@ struct llama_hparams {
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 0;
     float f_attn_temp_scale = 0.0f;
+    float f_attn_temp_offset = 0.0f; // offset position index

     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs

@@ -267,7 +270,8 @@ struct llama_hparams {
     // TODO: think of a better place for this function
     // TODO: pack the SWA params in a struct?
     static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
+
+    bool use_mrope() const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-
package/src/llama.cpp/src/llama-impl.cpp

@@ -25,6 +25,10 @@ time_meas::~time_meas() {
     }
 }

+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    ggml_log_get(log_callback, user_data);
+}
+
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
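Note: the llama-impl.cpp hunk above adds a `llama_log_get()` counterpart to the existing `llama_log_set()`. A hedged usage sketch follows, assuming the getter is exposed alongside `llama_log_set()` in `llama.h` and that `ggml_log_callback` keeps its usual `(level, text, user_data)` shape; this is an illustration, not an excerpt from the package.

```cpp
#include "llama.h"

#include <cstdio>

// Hypothetical quieter logger: drop everything below warning level.
static void quiet_logger(ggml_log_level level, const char * text, void * /*user_data*/) {
    if (level >= GGML_LOG_LEVEL_WARN) {
        std::fputs(text, stderr);
    }
}

void run_with_quiet_logs() {
    // Save the currently installed callback with the new getter...
    ggml_log_callback prev_cb = nullptr;
    void * prev_data = nullptr;
    llama_log_get(&prev_cb, &prev_data);

    // ...swap in our own callback for a noisy section of work...
    llama_log_set(quiet_logger, nullptr);

    // ... model loading / inference that would otherwise spam the log ...

    // ...then restore whatever was there before.
    llama_log_set(prev_cb, prev_data);
}
```

As the hunk shows, passing a null callback back to `llama_log_set()` falls back to the default logger, so the restore step is safe even if no callback had been installed.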
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(

     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+        }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }

@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [
-
+    for (const auto & [ctx, buf] : ctxs_bufs) {
+        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+        if (hparams.no_alloc) {
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+            ret[buft] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
+
     return ret;
 }

@@ -1232,8 +1249,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     GGML_ASSERT(n_tokens%n_stream == 0);

     // n_tps == n_tokens_per_stream
-    const int64_t n_tps
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
+    const int64_t n_tps = n_tokens/n_stream;

     std::fill(data, data + ggml_nelements(dst), -INFINITY);

@@ -1266,7 +1282,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
                 const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
                 const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;

-                const uint64_t idst = n_kv*(h*n_stream*
+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);

                 for (uint32_t j = 0; j < n_kv; ++j) {
                     if (cells.is_empty(j)) {

@@ -1370,9 +1386,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
     float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

-    const auto & yarn_ext_factor
-    const auto & yarn_beta_fast
-    const auto & yarn_beta_slow
+    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
+    const auto & yarn_beta_fast = cparams.yarn_beta_fast;
+    const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

     const auto & n_rot = hparams.n_rot;
     const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE

@@ -1383,12 +1400,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
         ? LLAMA_ROPE_TYPE_NEOX
         : hparams.rope_type;

-    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-        : cparams.yarn_attn_factor;
-
     ggml_tensor * tmp;

     if (ggml_is_quantized(cur->type)) {

@@ -1550,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama

         const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];

+        slot_info sinfo;
+
         bool res = true;
-        res = res && state_read_meta(io, strm, cell_count, seq_id);
-        res = res && state_read_data(io, strm, cell_count);
+        res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+        res = res && state_read_data(io, strm, cell_count, sinfo);

         if (!res) {
             if (seq_id == -1) {

@@ -1691,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
     }
 }

-bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head = v_heads[strm];

@@ -1728,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
             ubatch.seq_id[i] = &dest_seq_id;
         }

-
+        sinfo = find_slot(ubatch, false);
         if (sinfo.empty()) {
             LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
             return false;

@@ -1738,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
         // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
         apply_ubatch(sinfo, ubatch);

-
-
-        // keep the head at the old position because we will read the KV data into it in state_read_data()
-        head = head_cur;
+        LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);

-
-
-
-
-
-
-
-
-        GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
+        // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+        GGML_ASSERT(sinfo.n_stream() == 1);
+        GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            const uint32_t idx = sinfo.idxs[0][i];
+            GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+            GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+        }
     } else {
         // whole KV cache restore

@@ -1784,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
             }
         }

+        // Create contiguous slot_info for whole cache restore
+        sinfo.s0 = strm;
+        sinfo.s1 = strm;
+        sinfo.resize(1);
+        sinfo.strm[0] = strm;
+        sinfo.idxs[0].resize(cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            sinfo.idxs[0][i] = i;
+        }
+
         head = 0;
     }

     return true;
 }

-bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
     auto & cells = v_cells[strm];
-    auto & head = v_heads[strm];

     uint32_t v_trans;
     uint32_t n_layer;

@@ -1842,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
        }

        if (cell_count) {
-
-
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * k_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+                    ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
+                }
+            }
        }
    }

@@ -1874,8 +1901,17 @@
        }

        if (cell_count) {
-
-
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * v_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+                    ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
+                }
+            }
        }
    }
 } else {

@@ -1914,10 +1950,22 @@
            }

            if (cell_count) {
-
-
-                const
-
+                if (sinfo.is_contiguous()) {
+                    // Fast path: contiguous cells
+                    const uint32_t h = sinfo.head();
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const size_t dst_offset = (h + j * cells.size()) * v_size_el;
+                        ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    }
+                } else {
+                    // Slow path: scatter to non-contiguous positions
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const void * src = io.read(cell_count * v_size_el);
+                        for (uint32_t i = 0; i < cell_count; ++i) {
+                            const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+                            ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
+                        }
+                    }
                }
            }
        }
package/src/llama.cpp/src/llama-kv-cache.h

@@ -72,6 +72,23 @@ public:
         void clear() {
             idxs.clear();
         }
+
+        // check if indices are contiguous starting from head()
+        bool is_contiguous() const {
+            if (idxs.empty() || idxs[0].empty()) {
+                return true;
+            }
+            if (idxs.size() > 1) {
+                return false;
+            }
+            const uint32_t h = idxs[0][0];
+            for (size_t i = 0; i < idxs[0].size(); ++i) {
+                if (idxs[0][i] != h + i) {
+                    return false;
+                }
+            }
+            return true;
+        }
     };

     using slot_info_vec_t = std::vector<slot_info>;

@@ -264,8 +281,8 @@ private:
     void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;

-    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
 };

 class llama_kv_cache_context : public llama_memory_context_i {
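Note: the new `slot_info::is_contiguous()` above is what lets the `state_read_data()` changes in llama-kv-cache.cpp choose between a single contiguous copy and a per-cell scatter when restoring KV state. A simplified, self-contained sketch of that decision, using a plain index vector and `memcpy` as stand-ins for the real `slot_info` and `ggml_backend_tensor_set()` (all names here are illustrative):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// True if the cell indices form one contiguous run starting at idxs[0].
static bool is_contiguous(const std::vector<uint32_t> & idxs) {
    if (idxs.empty()) {
        return true;
    }
    const uint32_t h = idxs[0];
    for (size_t i = 0; i < idxs.size(); ++i) {
        if (idxs[i] != h + i) {
            return false;
        }
    }
    return true;
}

// Copy one serialized row per cell from `src` into `dst`, either as a single
// block (fast path) or one row at a time to scattered cells (slow path).
static void restore_rows(char * dst, const char * src,
                         const std::vector<uint32_t> & idxs, size_t row_size) {
    if (idxs.empty()) {
        return;
    }
    if (is_contiguous(idxs)) {
        std::memcpy(dst + idxs[0] * row_size, src, idxs.size() * row_size);
    } else {
        for (size_t i = 0; i < idxs.size(); ++i) {
            std::memcpy(dst + idxs[i] * row_size, src + i * row_size, row_size);
        }
    }
}
```

The fast path mirrors the old behaviour (one copy starting at the cache head); the slow path is what the non-contiguous `find_slot()` results introduced in this version require.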
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
     ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
-    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(),
+    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }

package/src/llama.cpp/src/llama-model-loader.cpp

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;

@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }

 std::string llama_model_loader::get_arch_name() const {
package/src/llama.cpp/src/llama-model-loader.h

@@ -71,6 +71,7 @@ struct llama_model_loader {

     bool use_mmap = false;
     bool check_tensors;
+    bool no_alloc;

     llama_files files;
     llama_ftype ftype;

@@ -97,6 +98,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
