@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/llama-arch.h

@@ -3,6 +3,7 @@
 #include "ggml.h" // ggml_op
 
 #include <string>
+#include <set>
 
 //
 // gguf constants (sync with gguf.py)
@@ -79,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_NEMOTRON_H,
+    LLM_ARCH_NEMOTRON_H_MOE,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
@@ -315,6 +317,7 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
     LLM_TENSOR_ROPE_FACTORS_LONG,
     LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -525,6 +528,10 @@ struct LLM_TN_IMPL {
     const int bid;
     const int xid;
 
+    const std::set<llm_tensor> model_tensors;
+
+    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
     std::string str() const;
 
     operator std::string() const {
@@ -546,11 +553,11 @@ struct LLM_TN {
     llm_arch arch;
 
     LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return
+        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
     }
 
     LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
-        return
+        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
     }
 };
 
package/src/llama.cpp/src/llama-batch.cpp

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
     udata->output .resize(n_tokens);
 
+    udata->seq_id_data.reserve(n_tokens);
+
     seq_set_t seq_set_unq;
 
     for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i] = batch.seq_id[idxs[i]];
         udata->output[i] = batch.logits[idxs[i]];
 
         for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
+
+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
         }
 
         if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         }
     }
 
+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
     for (uint32_t s = 0; s < n_seq_max; ++s) {
         if (seq_set_unq.test(s)) {
             udata->seq_idx[s] = udata->seq_id_unq.size();
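The hunks above change ownership of the per-token sequence ids: instead of storing pointers into the caller's `llama_batch`, the ubatch now copies the ids into its own contiguous `seq_id_data` vector and rewrites `seq_id[i]` to point into it once the vector has stopped growing. A minimal standalone sketch of the same two-pass pattern (the types and `main` below are illustrative, not the llama.cpp API):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// simplified stand-ins for the llama types, for illustration only
using seq_id_t = int32_t;

struct ubatch_sketch {
    std::vector<int32_t>    n_seq_id;    // number of sequence ids per token
    std::vector<seq_id_t *> seq_id;      // per-token pointer into seq_id_data
    std::vector<seq_id_t>   seq_id_data; // owned, contiguous storage
};

int main() {
    // two tokens: token 0 belongs to sequences {0, 2}, token 1 to {1}
    const std::vector<std::vector<seq_id_t>> src = {{0, 2}, {1}};

    ubatch_sketch u;
    u.n_seq_id.resize(src.size());
    u.seq_id  .resize(src.size());

    // first pass: copy all ids into the owned flat buffer
    for (size_t i = 0; i < src.size(); ++i) {
        u.n_seq_id[i] = (int32_t) src[i].size();
        u.seq_id_data.insert(u.seq_id_data.end(), src[i].begin(), src[i].end());
    }

    // second pass: only after the buffer stops growing, point seq_id[i] into it
    seq_id_t * ptr = u.seq_id_data.data();
    for (size_t i = 0; i < src.size(); ++i) {
        u.seq_id[i] = ptr;
        ptr += u.n_seq_id[i];
    }

    printf("token 0, seq[1] = %d\n", u.seq_id[0][1]); // prints 2
    return 0;
}
```

The pointer fix-up has to be a second pass because `push_back` may reallocate `seq_id_data` and invalidate any pointers taken during the first pass.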
package/src/llama.cpp/src/llama-batch.h

@@ -56,13 +56,15 @@ struct llama_ubatch {
         std::vector<float> embd;
         std::vector<llama_pos> pos;
         std::vector<int32_t> n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
         std::vector<llama_seq_id> seq_id_unq;
         std::vector<int32_t> seq_idx;
         std::vector<int8_t> output;
+
+        std::vector<llama_seq_id> seq_id_data;
     };
 
-    // the llama_ubatch pointers above point to this data if set. otherwise -
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
     std::shared_ptr<data_t> data;
 };
 
package/src/llama.cpp/src/llama-context.cpp

@@ -9,6 +9,7 @@
 #include "llama-model.h"
 
 #include <cinttypes>
+#include <cmath>
 #include <cstring>
 #include <limits>
 #include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+            float mscale = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }
+
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
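For intuition, the added block derives the YaRN attention scale from the context-extension factor `1 / rope_freq_scale` and then cancels the `1 + 0.1*ln(factor)` correction that, per the comment in the hunk, the RoPE kernel already applies when `ext_factor != 0`. A small self-contained check of the arithmetic (the example factor is made up; `mscale == mscale_all_dims == 1.0f` corresponds to the non-DeepSeek path):

```cpp
#include <cmath>
#include <cstdio>

// same helper as in the hunk: YaRN attention-magnitude correction ("mscale")
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    // example: rope_freq_scale = 0.25 -> context stretched by factor 4 (made-up value)
    const float factor          = 1.0f / 0.25f;
    const float mscale          = 1.0f; // assumed, as in the diff
    const float mscale_all_dims = 1.0f; // non-DeepSeek path

    float yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);

    // cancel the 1 + 0.1*ln(factor) term that (per the comment in the hunk)
    // the rope kernel applies on its own when ext_factor != 0
    yarn_attn_factor *= 1.0f / (1.0f + 0.1f * std::log(factor));

    // with mscale == mscale_all_dims the ratio is 1, so only the cancellation remains:
    // 1 / (1 + 0.1*ln(4)) ~= 0.8783
    printf("factor = %.1f -> yarn_attn_factor = %.4f\n", factor, yarn_attn_factor);
    return 0;
}
```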
@@ -93,14 +131,6 @@ llama_context::llama_context(
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
 
-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
     cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(
 
     backend_buft.clear();
     backend_ptrs.clear();
+    backend_buf_exp_size.clear();
 
     for (auto & backend : backends) {
         auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,6 +275,7 @@ llama_context::llama_context(
 
         backend_buft.push_back(buft);
         backend_ptrs.push_back(backend.get());
+        backend_buf_exp_size.push_back(0);
     }
 
     LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
@@ -359,7 +391,8 @@ llama_context::llama_context(
 
     // reserve pp (prompt processing) graph first so that buffers are only allocated once
     {
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+                model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
         if (!gf) {
             if (pipeline_parallel) {
                 LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(
 
     // reserve with tg (token generation) graph to get the number of splits and nodes
     {
-        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
        if (!gf) {
             throw std::runtime_error("failed to allocate compute tg buffers");
         }
@@ -392,7 +425,7 @@ llama_context::llama_context(
         //
        // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
         //
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
         if (!gf) {
             throw std::runtime_error("failed to allocate compute pp buffers");
         }
@@ -401,11 +434,13 @@ llama_context::llama_context(
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
-
-
+        if (!model.hparams.no_alloc) {
+            backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
+        if (backend_buf_exp_size[i] > 1) {
             LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                     ggml_backend_buft_name(buft),
-
+                    backend_buf_exp_size[i] / 1024.0 / 1024.0);
         }
     }
 
@@ -424,6 +459,23 @@ llama_context::llama_context(
 }
 
 llama_context::~llama_context() {
+    // FIXME this currently results in a use-after-free bug if the model is freed before the context
+    // if (!model.hparams.no_alloc) {
+    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+    //         ggml_backend_t backend = backend_ptrs[i];
+    //         ggml_backend_buffer_type_t buft = backend_buft[i];
+
+    //         const size_t size_exp = backend_buf_exp_size[i];
+    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    //         if (size_exp == size_act) {
+    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         } else {
+    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+    //         }
+    //     }
+    // }
     ggml_opt_free(opt_ctx);
 }
 
@@ -1326,6 +1378,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
             // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
             LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+            synchronize();
             buf_output = nullptr;
             logits = nullptr;
             embd = nullptr;
@@ -1397,7 +1450,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
     return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }
 
-ggml_cgraph * llama_context::graph_reserve(
+ggml_cgraph * llama_context::graph_reserve(
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);
 
@@ -1434,8 +1488,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 
     // initialize scheduler with the specified graph
     if (split_only) {
-
+        if (sizes) {
+            ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
+        } else {
+            ggml_backend_sched_split_graph(sched.get(), gf);
+        }
     } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+        GGML_ASSERT(!sizes);
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
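Taken together, these constructor and `graph_reserve()` changes add a `no_alloc` path: during graph reserve the scheduler is only asked how large each backend's compute buffer would be (stored in `backend_buf_exp_size`) instead of allocating it. A rough standalone sketch of that split, under the assumption that a dry-run sizing call is available; `sched_sketch` and `reserve_graph` are invented names, not the ggml or llama API:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// hypothetical stand-in for a backend scheduler; NOT the ggml API
struct sched_sketch {
    // pretend these are the compute-buffer sizes the graph would need per backend
    std::vector<size_t> needed = {64u << 20, 8u << 20};

    void reserve_size(size_t * sizes) const {               // dry run: report sizes only
        for (size_t i = 0; i < needed.size(); ++i) {
            sizes[i] = needed[i];
        }
    }
    bool   reserve() const { return true; }                  // real allocation would happen here
    size_t buffer_size(size_t i) const { return needed[i]; }
};

// mirrors the split introduced in graph_reserve(): either estimate sizes or allocate
static void reserve_graph(const sched_sketch & sched, bool no_alloc, std::vector<size_t> & exp_size) {
    if (no_alloc) {
        sched.reserve_size(exp_size.data()); // fill expected sizes, allocate nothing
    } else {
        (void) sched.reserve();              // allocate compute buffers now
        for (size_t i = 0; i < exp_size.size(); ++i) {
            exp_size[i] = sched.buffer_size(i);
        }
    }
}

int main() {
    sched_sketch sched;
    std::vector<size_t> exp_size(2, 0);

    reserve_graph(sched, /*no_alloc=*/true, exp_size);
    printf("expected compute buffers: %.2f MiB, %.2f MiB\n",
           exp_size[0] / 1024.0 / 1024.0, exp_size[1] / 1024.0 / 1024.0);
    return 0;
}
```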
@@ -2057,15 +2116,26 @@ void llama_context::perf_reset() {
 
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
-    for (const auto &
-        ret[
+    for (const auto & [buft, size] : model.memory_breakdown()) {
+        ret[buft].model += size;
     }
-
-
+    if (memory) {
+        for (const auto & [buft, size] : memory->memory_breakdown()) {
+            ret[buft].context += size;
+        }
     }
-
-
-
+    if (model.hparams.no_alloc) {
+        for (size_t i = 0; i < backends.size(); ++i) {
+            ggml_backend_t backend = backends[i].get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += backend_buf_exp_size[i];
+        }
+    } else {
+        for (const auto & backend_ptr : backends) {
+            ggml_backend_t backend = backend_ptr.get();
+            ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
+            ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
     }
     return ret;
 }
package/src/llama.cpp/src/llama-context.h

@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
     size_t model = 0; // memory allocated for the model
     size_t context = 0; // memory allocated for the context
     size_t compute = 0; // memory allocated for temporary compute buffers
+
+    size_t total() const {
+        return model + context + compute;
+    }
 };
 
 struct llama_context {
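A small usage sketch of the per-buffer-type breakdown that `memory_breakdown()` returns together with the new `total()` helper. The struct body is copied from the hunk; the map key is simplified to a string and the sizes are made up so the example stays self-contained:

```cpp
#include <cstddef>
#include <cstdio>
#include <map>
#include <string>

// copied from the hunk above, minus the llama-specific context
struct memory_breakdown_data {
    size_t model   = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers

    size_t total() const {
        return model + context + compute;
    }
};

int main() {
    // in llama.cpp the key is a ggml_backend_buffer_type_t; a string stands in here,
    // and the sizes are made up for illustration
    std::map<std::string, memory_breakdown_data> breakdown;
    breakdown["GPU"] = {2048ull << 20, 512ull << 20, 160ull << 20};
    breakdown["CPU"] = {64ull << 20, 0, 24ull << 20};

    for (const auto & [buft, mem] : breakdown) {
        printf("%-4s model = %7.1f MiB, context = %7.1f MiB, compute = %7.1f MiB, total = %7.1f MiB\n",
               buft.c_str(),
               mem.model   / 1024.0 / 1024.0,
               mem.context / 1024.0 / 1024.0,
               mem.compute / 1024.0 / 1024.0,
               mem.total() / 1024.0 / 1024.0);
    }
    return 0;
}
```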
@@ -206,7 +210,8 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(
+    ggml_cgraph * graph_reserve(
+            uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
 
 private:
     llm_graph_params graph_params(
@@ -281,9 +286,10 @@ private:
 
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
 
-    // buffer types used for the compute buffer of each backend
+    // pointers and buffer types used for the compute buffer of each backend
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
+    std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
 
     llm_graph_result_ptr gf_res_prev;
     llm_graph_result_ptr gf_res_reserve;
package/src/llama.cpp/src/llama-graph.cpp

@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
             attn_scale_data[i] = std::log(
-                std::floor((pos +
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
             ) * f_attn_temp_scale + 1.0;
         }
 
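Spelled out, the per-token scale computed above is `log(floor((pos + offset) / floor_scale) + 1) * scale + 1`, i.e. the new `f_attn_temp_offset` only shifts the position before the floor/log ramp. A scalar sketch with made-up parameter values:

```cpp
#include <cmath>
#include <cstdio>

// scalar version of the attention-temperature scale from the hunk above
static float attn_temp_scale(float pos, float offset, float floor_scale, float scale) {
    return std::log(std::floor((pos + offset) / floor_scale) + 1.0f) * scale + 1.0f;
}

int main() {
    // parameter values are made up for illustration
    const float offset      = 0.0f;    // f_attn_temp_offset (new in this change)
    const float floor_scale = 8192.0f; // n_attn_temp_floor_scale
    const float scale       = 0.1f;    // f_attn_temp_scale

    const float positions[] = {0.0f, 8192.0f, 65536.0f};
    for (float pos : positions) {
        printf("pos = %7.0f -> attn_scale = %.4f\n", pos, attn_temp_scale(pos, offset, floor_scale, scale));
    }
    return 0;
}
```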
@@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= s_copy->ne[0] == mctx->get_n_rs();
+
+    res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= head == mctx->get_head();
+    res &= rs_z == mctx->get_rs_z();
+
+    return res;
+}
+
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);
 
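The `can_reuse()` overrides added in this diff all follow one pattern: a previously built graph may be reused only if every input tensor shape and every value baked into view offsets (here `head` and `rs_z`) would come out identical for the next ubatch. A stripped-down illustration of that check; the struct and field names are invented for the example:

```cpp
#include <cstdint>
#include <cstdio>

// invented stand-in: the dimensions and offsets a cached graph was built with
struct cached_graph_inputs {
    int64_t  n_rs;   // size of the recurrent-state copy tensor
    int64_t  n_seqs; // number of sequences in the ubatch
    uint32_t head;   // cache head baked into view offsets
    int32_t  rs_z;   // zeroed slot baked into view offsets

    // reuse is only safe if everything baked into the graph still matches
    bool can_reuse(int64_t n_rs_new, int64_t n_seqs_new, uint32_t head_new, int32_t rs_z_new) const {
        bool res = true;

        res &= n_rs   == n_rs_new;
        res &= n_seqs == n_seqs_new;

        res &= head == head_new;
        res &= rs_z == rs_z_new;

        return res;
    }
};

int main() {
    const cached_graph_inputs cached = {8, 2, 0, -1};

    printf("same shapes, same offsets: %d\n", cached.can_reuse(8, 2, 0, -1)); // 1 -> reuse
    printf("cache head moved:          %d\n", cached.can_reuse(8, 2, 3, -1)); // 0 -> rebuild
    return 0;
}
```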
@@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] ==
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
-    res &= self_kq_mask->ne[1] ==
+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
 
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
-    res &= self_kq_mask_swa->ne[1] ==
+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
 
     return res;
 }
@@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
             }
         }
 
-        for (int i = n_tokens; i <
+        for (int i = n_tokens; i < n_tokens; ++i) {
             for (int j = 0; j < n_enc; ++j) {
                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
             }
@@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
-    inp_attn->
-
+    mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
+    mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+
+    mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+
+    const int64_t n_rs = mctx->get_recr()->get_n_rs();
+
+    if (inp_rs->s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
+        int32_t * data = (int32_t *) inp_rs->s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
+        for (uint32_t i = 0; i < n_rs; ++i) {
+            data[i] = mctx->get_recr()->s_copy(i);
+        }
+    }
+}
+
+bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
+    //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+    res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
+    res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
+
+    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+
+    res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+
+    res &= inp_rs->head == mctx->get_recr()->get_head();
+    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
+
+    return res;
 }
 
 //
@@ -1089,6 +1145,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                     cur = ggml_relu(ctx0, cur);
                     cb(cur, "ffn_moe_relu", il);
                 } break;
+            case LLM_FFN_RELU_SQR:
+                if (gate_exps) {
+                    // TODO: add support for gated squared relu
+                    GGML_ABORT("fatal error: gated squared relu not implemented");
+                } else {
+                    cur = ggml_relu(ctx0, cur);
+                    cur = ggml_sqr(ctx0, cur);
+                    cb(cur, "ffn_moe_relu_sqr", il);
+                } break;
             default:
                 GGML_ABORT("fatal error");
         }
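The new `LLM_FFN_RELU_SQR` branch composes the squared-ReLU activation from the existing `ggml_relu` and `ggml_sqr` ops (the gated variant is explicitly aborted as unimplemented). Element-wise the activation is simply `max(x, 0)^2`:

```cpp
#include <algorithm>
#include <cstdio>

// scalar equivalent of ggml_relu followed by ggml_sqr
static float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    const float xs[] = {-2.0f, 0.0f, 1.5f};
    for (float x : xs) {
        printf("relu_sqr(%5.2f) = %.2f\n", x, relu_sqr(x));
    }
    return 0;
}
```

Squared ReLU appears in some MoE feed-forward blocks; its addition here likely lines up with the new NEMOTRON_H_MOE architecture enum, though the diff excerpt itself does not say which model uses it.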
@@ -1203,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
 
     auto & cur = inp->attn_scale;
 
@@ -1470,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens,
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens,
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
         ggml_set_input(inp->self_kq_mask_swa);
 
         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1558,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1701,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc,
+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
     ggml_set_input(inp->cross_kq_mask);
 
     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1767,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1781,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
     inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
     inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
 
-    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv,
+    inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
     ggml_set_input(inp->self_kq_mask_swa);
 
     inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1841,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
     inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
     inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
 
+    inp->head = mctx_cur->get_head();
+    inp->rs_z = mctx_cur->get_rs_z();
+
     return inp;
 }
 
@@ -1909,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
 llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
 
-    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
+    auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
     auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
 
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
+    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
 
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
package/src/llama.cpp/src/llama-graph.h

@@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
     virtual ~llm_graph_input_attn_temp() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -142,6 +142,7 @@ public:
 
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
+    const float f_attn_temp_offset;
 };
 
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
@@ -224,6 +225,8 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     ggml_tensor * s_copy; // I32 [n_rs]
 
     // views of s_copy, computed once per graph
@@ -232,6 +235,10 @@ public:
     ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t rs_z;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -364,22 +371,28 @@ public:
 class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 public:
     llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
             std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
-            std::unique_ptr<llm_graph_input_rs>
-            const llama_memory_hybrid_context *
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
         inp_attn(std::move(inp_attn)),
         inp_rs(std::move(inp_rs)),
+        cparams(cparams),
         mctx(mctx) { }
     virtual ~llm_graph_input_mem_hybrid() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    bool can_reuse(const llm_graph_params & params) override;
+
     std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
     std::unique_ptr<llm_graph_input_rs> inp_rs;
 
     llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
     llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
 
+    const llama_cparams cparams;
+
     const llama_memory_hybrid_context * mctx;
 };
 
package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,8 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
+
+#include <algorithm>
 #include <cassert>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@@ -229,3 +231,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
 
     return false;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}