@fugood/llama.node 1.4.6 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
```diff
--- package/src/llama.cpp/src/llama-arch.h
+++ package/src/llama.cpp/src/llama-arch.h
@@ -3,6 +3,7 @@
 #include "ggml.h" // ggml_op
 
 #include <string>
+#include <set>
 
 //
 // gguf constants (sync with gguf.py)
@@ -79,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_NEMOTRON_H,
+    LLM_ARCH_NEMOTRON_H_MOE,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
@@ -315,6 +317,7 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
     LLM_TENSOR_ROPE_FACTORS_LONG,
     LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
     const int bid;
     const int xid;
 
+    const std::set<llm_tensor> model_tensors;
+
+    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
     std::string str() const;
 
     operator std::string() const {
@@ -546,11 +553,11 @@ struct LLM_TN {
     llm_arch arch;
 
     LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return
+        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
     }
 
     LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
-        return
+        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
     }
 };
 
```
```diff
--- package/src/llama.cpp/src/llama-batch.cpp
+++ package/src/llama.cpp/src/llama-batch.cpp
@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
     udata->output  .resize(n_tokens);
 
+    udata->seq_id_data.reserve(n_tokens);
+
     seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i] = batch.seq_id[idxs[i]];
         udata->output[i] = batch.logits[idxs[i]];
 
         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }
 
         if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }
 
+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
```
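The llama-batch.cpp change above stops storing pointers into the caller's `llama_batch` and instead copies all sequence ids into a ubatch-owned `seq_id_data` buffer, fixing up the per-token `seq_id` pointers only after every id has been pushed (so vector reallocation cannot invalidate them). A minimal standalone sketch of that two-pass pattern; the `ubatch_seq_ids`/`build`/`per_token` names are illustrative stand-ins, not the actual llama.cpp structures:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_seq_id = int32_t;

struct ubatch_seq_ids {
    std::vector<int32_t>        n_seq_id;    // number of seq ids per token
    std::vector<llama_seq_id *> seq_id;      // points into seq_id_data
    std::vector<llama_seq_id>   seq_id_data; // owning, contiguous storage

    void build(const std::vector<std::vector<llama_seq_id>> & per_token) {
        n_seq_id.clear();
        seq_id.assign(per_token.size(), nullptr);
        seq_id_data.clear();

        // first pass: copy every id into the owning buffer
        for (const auto & ids : per_token) {
            n_seq_id.push_back((int32_t) ids.size());
            seq_id_data.insert(seq_id_data.end(), ids.begin(), ids.end());
        }

        // second pass: fix up the per-token pointers, now that
        // seq_id_data will no longer reallocate
        llama_seq_id * p = seq_id_data.data();
        for (size_t i = 0; i < per_token.size(); ++i) {
            seq_id[i] = p;
            p += n_seq_id[i];
        }
    }
};
```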
```diff
--- package/src/llama.cpp/src/llama-batch.h
+++ package/src/llama.cpp/src/llama-batch.h
@@ -56,13 +56,15 @@ struct llama_ubatch {
         std::vector<float>          embd;
         std::vector<llama_pos>      pos;
         std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
         std::vector<llama_seq_id>   seq_id_unq;
         std::vector<int32_t>        seq_idx;
         std::vector<int8_t>         output;
+
+        std::vector<llama_seq_id>   seq_id_data;
     };
 
-    // the llama_ubatch pointers above point to this data if set. otherwise -
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };
 
```
```diff
--- package/src/llama.cpp/src/llama-context.cpp
+++ package/src/llama.cpp/src/llama-context.cpp
@@ -9,6 +9,7 @@
 #include "llama-model.h"
 
 #include <cinttypes>
+#include <cmath>
 #include <cstring>
 #include <limits>
 #include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+            float mscale = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }
+
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -93,14 +131,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(
 
     backend_buft.clear();
     backend_ptrs.clear();
+    backend_buf_exp_size.clear();
 
     for (auto & backend : backends) {
         auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,11 +275,15 @@ llama_context::llama_context(
 
         backend_buft.push_back(buft);
         backend_ptrs.push_back(backend.get());
+        backend_buf_exp_size.push_back(0);
     }
 
     LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
 
-    const
+    const uint32_t n_seqs   = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    const size_t max_nodes = this->graph_max_nodes(n_tokens);
 
     LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
 
@@ -300,9 +335,6 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.n_seq_max;
-    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
     // avoid reserving graphs with zero outputs - assume one output per sequence
     n_outputs = n_seqs;
 
@@ -359,7 +391,8 @@ llama_context::llama_context(
 
     // reserve pp (prompt processing) graph first so that buffers are only allocated once
     {
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+                model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
         if (!gf) {
             if (pipeline_parallel) {
                 LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(
 
     // reserve with tg (token generation) graph to get the number of splits and nodes
     {
-        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
         if (!gf) {
             throw std::runtime_error("failed to allocate compute tg buffers");
         }
@@ -392,7 +425,7 @@ llama_context::llama_context(
         //
         // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
         //
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
         if (!gf) {
             throw std::runtime_error("failed to allocate compute pp buffers");
         }
@@ -401,11 +434,13 @@ llama_context::llama_context(
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
-
-
+        if (!model.hparams.no_alloc) {
+            backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
+        if (backend_buf_exp_size[i] > 1) {
             LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                     ggml_backend_buft_name(buft),
-
+                    backend_buf_exp_size[i] / 1024.0 / 1024.0);
         }
     }
 
@@ -424,6 +459,23 @@ llama_context::llama_context(
 }
 
 llama_context::~llama_context() {
+    // FIXME this currently results in a use-after-free bug if the model is freed before the context
+    // if (!model.hparams.no_alloc) {
+    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+    //         ggml_backend_t backend = backend_ptrs[i];
+    //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+    //         const size_t size_exp = backend_buf_exp_size[i];
+    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    //         if (size_exp == size_act) {
+    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         } else {
+    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         }
+    //     }
+    // }
     ggml_opt_free(opt_ctx);
 }
 
@@ -1326,6 +1378,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
     LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+    synchronize();
     buf_output = nullptr;
     logits = nullptr;
     embd = nullptr;
@@ -1386,9 +1439,9 @@ void llama_context::output_reorder() {
 // graph
 //
 
-uint32_t llama_context::graph_max_nodes() const {
+uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT) {
-        return std::max<uint32_t>(
+        return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
     return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
@@ -1397,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
     return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }
 
-ggml_cgraph * llama_context::graph_reserve(
+ggml_cgraph * llama_context::graph_reserve(
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);
 
@@ -1434,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 
     // initialize scheduler with the specified graph
     if (split_only) {
-
+        if (sizes) {
+            ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+        } else {
+            ggml_backend_sched_split_graph(sched.get(), gf);
+        }
     } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        GGML_ASSERT(!sizes);
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
@@ -2057,15 +2116,26 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-    for (const auto &
-        ret[
+    for (const auto & [buft, size] : model.memory_breakdown()) {
+        ret[buft].model += size;
     }
-
-
+    if (memory) {
+        for (const auto & [buft, size] : memory->memory_breakdown()) {
+            ret[buft].context += size;
+        }
     }
-
-
-
+    if (model.hparams.no_alloc) {
+        for (size_t i = 0; i < backends.size(); ++i) {
+            ggml_backend_t backend = backends[i].get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += backend_buf_exp_size[i];
+        }
+    } else {
+        for (const auto & backend_ptr : backends) {
+            ggml_backend_t backend = backend_ptr.get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
     }
     return ret;
 }
```
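Restating the new `yarn_attn_factor` computation from the llama-context.cpp hunks above in formula form (a direct transcription of the code, not additional behavior); here $s = 1/\texttt{rope\_freq\_scale}$, $\mu_{\text{all}} = \texttt{rope\_yarn\_log\_mul}$, and $m$ is the `get_mscale` lambda:

```latex
% mscale helper (get_mscale in the code)
m(s,\mu) =
\begin{cases}
1, & s \le 1 \\
0.1\,\mu\,\ln s + 1, & s > 1
\end{cases}

% resulting factor, before the existing multiplication by rope_attn_factor
\text{yarn\_attn\_factor} =
\frac{m(s,\mu)}{m(s,\mu_{\text{all}})} \cdot \frac{1}{1 + 0.1\,\ln s}
```

where $\mu = 1$ in the general case, $\mu = \mu_{\text{all}}$ for `LLM_ARCH_DEEPSEEK2` when $\mu_{\text{all}} \ne 1$, and the first ratio is replaced by $m(s, 1)$ when $\mu_{\text{all}} = 0$. The trailing $1/(1 + 0.1\,\ln s)$ term cancels the YaRN scaling that ggml applies internally, as the code comments note.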
```diff
--- package/src/llama.cpp/src/llama-context.h
+++ package/src/llama.cpp/src/llama-context.h
@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
     size_t model   = 0; // memory allocated for the model
     size_t context = 0; // memory allocated for the context
     size_t compute = 0; // memory allocated for temporary compute buffers
+
+    size_t total() const {
+        return model + context + compute;
+    }
 };
 
 struct llama_context {
@@ -197,7 +201,7 @@ private:
     //
 
 public:
-    uint32_t graph_max_nodes() const;
+    uint32_t graph_max_nodes(uint32_t n_tokens) const;
 
     // can reuse the llm_graph_result instance of the context (for example to update a memory module)
     llm_graph_result * get_gf_res_reserve() const;
@@ -206,7 +210,8 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(
+    ggml_cgraph * graph_reserve(
+            uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
 
 private:
     llm_graph_params graph_params(
@@ -281,9 +286,10 @@ private:
 
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
 
-    // buffer types used for the compute buffer of each backend
+    // pointers and buffer types used for the compute buffer of each backend
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
 
     llm_graph_result_ptr gf_res_prev;
     llm_graph_result_ptr gf_res_reserve;
```
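The new `llama_memory_breakdown_data::total()` accessor in the llama-context.h hunk above pairs with the `memory_breakdown()` changes in llama-context.cpp. A hedged, illustrative sketch of how such a breakdown can be summarized; `print_memory_breakdown` is hypothetical and not part of the package, the struct is re-declared only to keep the example self-contained, and a string key stands in for the `ggml_backend_buffer_type_t` key used by llama.cpp:

```cpp
#include <cstdio>
#include <map>
#include <string>

// re-declaration of the struct shape shown in the hunk above
struct llama_memory_breakdown_data {
    size_t model   = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers

    size_t total() const {
        return model + context + compute;
    }
};

// print one summary line per buffer type using the new total() accessor
static void print_memory_breakdown(const std::map<std::string, llama_memory_breakdown_data> & breakdown) {
    for (const auto & [name, mb] : breakdown) {
        std::printf("%10s: model = %zu B, context = %zu B, compute = %zu B, total = %zu B\n",
                name.c_str(), mb.model, mb.context, mb.compute, mb.total());
    }
}

int main() {
    std::map<std::string, llama_memory_breakdown_data> breakdown;

    llama_memory_breakdown_data cpu;
    cpu.model   = 512u * 1024 * 1024;
    cpu.context =  64u * 1024 * 1024;
    cpu.compute =  16u * 1024 * 1024;
    breakdown["CPU"] = cpu;

    print_memory_breakdown(breakdown);
    return 0;
}
```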