@fugood/llama.node 1.1.8 → 1.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +9 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +14 -1
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +34 -3
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
- package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
- package/src/llama.cpp/src/llama-memory.h +11 -8
- package/src/llama.cpp/src/llama-model.cpp +396 -187
- package/src/llama.cpp/src/llama-model.h +1 -0
package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp}

@@ -1,4 +1,4 @@
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 
 #include "llama-impl.h"
 #include "llama-io.h"
@@ -13,36 +13,29 @@
 #include <stdexcept>
 
 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //
 
-llama_kv_cache_unified::llama_kv_cache_unified(
-        const llama_model &
-
-        ggml_type
-
-        bool
-        bool
-
-        uint32_t
-        uint32_t
-        uint32_t
-
-
+llama_kv_cache::llama_kv_cache(
+        const llama_model & model,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        bool offload,
+        bool unified,
+        uint32_t kv_size,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) :
     model(model), hparams(model.hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
 
-
-    auto n_layer_cache = hparams.n_layer;
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        n_layer_cache = 20;
-    }
-    if (model.arch == LLM_ARCH_GLM4_MOE) {
-        // GLM-4.5: Only process up to last layer, skip final NextN layer
-        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
-    }
+    const uint32_t n_layer_kv = hparams.n_layer_kv();
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
             };
@@ -97,9 +90,14 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 __func__, hparams.n_embd_v_gqa_max());
     }
 
-    for (uint32_t il = 0; il < n_layer_cache; il++) {
+    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        if (!hparams.has_kv(il)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+            continue;
+        }
+
         if (filter && !filter(il)) {
-            LLAMA_LOG_DEBUG("%s: layer %3d:
+            LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
             continue;
         }
 
@@ -147,23 +145,27 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         layers.push_back({ il, k, v, k_stream, v_stream, });
     }
 
-
-
-        LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+    if (reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
 
-        for (uint32_t il =
-
-
+        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+            const int32_t il_reuse = reuse(il);
+
+            if (il_reuse < 0) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
                 continue;
             }
 
-
-
+            if (filter && !filter(il)) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+                continue;
+            }
 
             GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
             map_layer_ids[il] = map_layer_ids[il_reuse];
 
-            LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d,
+            LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
         }
     }
 
@@ -209,7 +211,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     }
 }
 
-void llama_kv_cache_unified::clear(bool data) {
+void llama_kv_cache::clear(bool data) {
     for (uint32_t s = 0; s < n_stream; ++s) {
         v_cells[s].reset();
         v_heads[s] = 0;
@@ -222,7 +224,7 @@ void llama_kv_cache_unified::clear(bool data) {
     }
 }
 
-bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
@@ -285,7 +287,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     return true;
 }
 
-void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
     GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
 
@@ -368,7 +370,7 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
     //}
 }
 
-void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -390,7 +392,7 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
     }
 }
 
-void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -434,7 +436,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
     head = new_head != cells.size() ? new_head : 0;
 }
 
-void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -467,7 +469,7 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
     }
 }
 
-llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -475,7 +477,7 @@ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
     return cells.seq_pos_min(seq_id);
 }
 
-llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -483,7 +485,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_batch(
+llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
         bool embd_all) {
@@ -513,62 +515,34 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch(
             break;
         }
 
-        return std::make_unique<llama_kv_cache_unified_context>(
+        return std::make_unique<llama_kv_cache_context>(
                 this, std::move(sinfos), std::move(ubatches));
     } while (false);
 
-    return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+    return std::make_unique<llama_kv_cache_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_full() {
-    return std::make_unique<llama_kv_cache_unified_context>(this);
+llama_memory_context_ptr llama_kv_cache::init_full() {
+    return std::make_unique<llama_kv_cache_context>(this);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
-
-
-    defrag_info dinfo;
-
-    // see if we need to defrag
-    if (n_stream == 1) {
-        // note : for now do not consider defrag for n_stream > 1
-        const auto & cells = v_cells[seq_to_stream[0]];
-
-        bool do_defrag = optimize;
-
-        const auto thold = lctx->get_cparams().defrag_thold;
-
-        if (!do_defrag && thold > 0.0f) {
-            const auto n_kv = cells.used_max_p1();
-
-            // - do not defrag small contexts (i.e. < 2048 tokens)
-            // - count the padding towards the number of used tokens
-            const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
-
-            if (fragmentation > thold) {
-                LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
-
-                do_defrag = true;
-            }
-        }
+llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
+    GGML_UNUSED(optimize);
 
-
-            dinfo = defrag_prepare(lctx->graph_max_nodes());
-        }
-    }
+    bool do_shift = get_has_shift();
 
-    return std::make_unique<
+    return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
 }
 
-llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
-    llama_kv_cache_unified::slot_info_vec_t res;
+llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
+    llama_kv_cache::slot_info_vec_t res;
 
     struct state_t {
         slot_info sinfo; // slot info for the ubatch
 
         std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch
 
-        std::vector<llama_kv_cells_unified> v_cells; // copy of the old cells, before placing the ubatch
+        std::vector<llama_kv_cells> v_cells; // copy of the old cells, before placing the ubatch
     };
 
     // remember the old state of the cells so we can restore it in the end
@@ -629,7 +603,7 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st
     return res;
 }
 
-bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
+bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
     bool updated = false;
 
     auto * sched = lctx->get_sched();
@@ -699,57 +673,10 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
     }
     }
 
-    if (!dinfo.empty()) {
-        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-
-        // note: for now do not consider defrag for n_stream > 1
-        auto & cells = v_cells[seq_to_stream[0]];
-        auto & head = v_heads[seq_to_stream[0]];
-
-        // apply moves:
-        {
-            const auto n_kv = dinfo.ids.size();
-
-            for (uint32_t i = 0; i < n_kv; ++i) {
-                assert(dinfo.ids[i] <= n_kv);
-
-                if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
-                    continue;
-                }
-
-                cells.mv(i, dinfo.ids[i]);
-            }
-
-            // reset the head so we can find the first free slot during the next ubatch
-            head = 0;
-        }
-
-        ggml_backend_sched_reset(sched);
-
-        auto * res = lctx->get_gf_res_reserve();
-
-        res->reset();
-
-        auto * gf = build_graph_defrag(res, lctx, dinfo);
-        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
-            return updated;
-        }
-
-        res->set_inputs(nullptr);
-
-        if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
-            LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
-            return updated;
-        }
-
-        updated = true;
-    }
-
     return updated;
 }
 
-llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
+llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const {
 
     if (debug > 0) {
         for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
@@ -948,7 +875,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
     return res;
 }
 
-void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
+void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
     // keep track of the max sequence position that we would overwrite with this ubatch
     // for non-SWA cache, this would be always empty
     llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
@@ -1013,21 +940,21 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u
     }
 }
 
-bool llama_kv_cache_unified::get_can_shift() const {
+bool llama_kv_cache::get_can_shift() const {
     return true;
 }
 
-uint32_t llama_kv_cache_unified::get_size() const {
+uint32_t llama_kv_cache::get_size() const {
     const auto & cells = v_cells[seq_to_stream[0]];
 
     return cells.size();
 }
 
-uint32_t llama_kv_cache_unified::get_n_stream() const {
+uint32_t llama_kv_cache::get_n_stream() const {
     return n_stream;
 }
 
-bool llama_kv_cache_unified::get_has_shift() const {
+bool llama_kv_cache::get_has_shift() const {
     bool result = false;
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1037,7 +964,7 @@ bool llama_kv_cache_unified::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache_unified::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv() const {
     uint32_t result = 0;
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1049,11 +976,11 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache_unified::get_supports_set_rows() const {
+bool llama_kv_cache::get_supports_set_rows() const {
     return supports_set_rows;
 }
 
-ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
@@ -1073,7 +1000,7 @@ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint
             ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
 }
 
-ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1105,7 +1032,7 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
             ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
@@ -1135,7 +1062,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_
     return ggml_cpy(ctx, k_cur, k_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1189,7 +1116,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     const uint32_t n_tokens = ubatch.n_tokens;
 
     ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
@@ -1199,7 +1126,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, con
     return k_idxs;
 }
 
-ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     const uint32_t n_tokens = ubatch.n_tokens;
 
     ggml_tensor * v_idxs;
@@ -1215,7 +1142,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, con
     return v_idxs;
 }
 
-void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
     if (!supports_set_rows) {
         return;
     }
@@ -1235,7 +1162,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba
     }
 }
 
-void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
     if (!supports_set_rows) {
         return;
     }
@@ -1272,7 +1199,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
     }
 }
 
-void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
 
     int32_t * data = (int32_t *) dst->data;
@@ -1286,7 +1213,7 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
     }
 }
 
-void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const uint32_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
@@ -1358,7 +1285,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
     }
 }
 
-void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     const int64_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
@@ -1383,7 +1310,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
     }
 }
 
-size_t llama_kv_cache_unified::total_size() const {
+size_t llama_kv_cache::total_size() const {
     size_t size = 0;
 
     for (const auto & buf : bufs) {
@@ -1393,7 +1320,7 @@ size_t llama_kv_cache_unified::total_size() const {
     return size;
 }
 
-size_t llama_kv_cache_unified::size_k_bytes() const {
+size_t llama_kv_cache::size_k_bytes() const {
     size_t size_k_bytes = 0;
 
     for (const auto & layer : layers) {
@@ -1403,7 +1330,7 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
     return size_k_bytes;
 }
 
-size_t llama_kv_cache_unified::size_v_bytes() const {
+size_t llama_kv_cache::size_v_bytes() const {
     size_t size_v_bytes = 0;
 
     for (const auto & layer : layers) {
@@ -1413,7 +1340,7 @@ size_t llama_kv_cache_unified::size_v_bytes() const {
     return size_v_bytes;
 }
 
-ggml_tensor * llama_kv_cache_unified::build_rope_shift(
+ggml_tensor * llama_kv_cache::build_rope_shift(
         const llama_cparams & cparams,
         ggml_context * ctx,
         ggml_tensor * cur,
@@ -1465,14 +1392,14 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
 
 class llm_graph_input_k_shift : public llm_graph_input_i {
 public:
-    llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_k_shift() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * k_shift; // I32 [kv_size*n_stream]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache * kv_self;
 };
 
 void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
@@ -1483,7 +1410,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
+ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
     auto * ctx = res->get_ctx();
     auto * gf = res->get_gf();
 
@@ -1525,284 +1452,7 @@ ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res,
     return gf;
 }
 
-ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
-        llm_graph_result * res,
-        llama_context * lctx,
-        const defrag_info & dinfo) const {
-    auto * ctx = res->get_ctx();
-    auto * gf = res->get_gf();
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const auto & ids = dinfo.ids;
-
-    const auto & cparams = lctx->get_cparams();
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    // - multiple threads
-    // - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(v_l[il]->type);
-        const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os = i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os = i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    for (uint32_t i = 0; i < ids.size(); ++i) {
-        const uint32_t id = ids[i];
-
-        if (i == id || id == ids.size()) {
-            continue;
-        }
-
-        uint32_t nm = 1;
-
-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-            nm++;
-        }
-
-        for (const auto & layer : layers) {
-            const uint32_t il = layer.il;
-
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-            ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*i));
-
-            ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*id));
-
-            ggml_tensor * view_v_src;
-            ggml_tensor * view_v_dst;
-
-            if (cparams.flash_attn) {
-                // NOTE: the V cache is not transposed when using flash attention
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*id));
-            } else {
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, id));
-            }
-
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-
-    return gf;
-}
-
-llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const uint32_t n_layer = layers.size();
-
-    const uint32_t n_kv = cells.used_max_p1();
-    const uint32_t n_used = cells.get_used();
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
-    // - source view, destination view, copy operation
-    // - x2 for keys and values
-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    defrag_info res;
-    auto & ids = res.ids;
-
-    ids.resize(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        if (!cells.is_empty(i0)) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            if (cells.is_empty(is) || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            if (cells.is_empty(i1) || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return {};
-    }
-
-    LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
-    LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
-
-    return res;
-}
-
-bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
     assert(p0 >= 0 && p1 >= 0);
 
     switch (swa_type) {
@@ -1828,7 +1478,7 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 
     io.write(&n_stream, sizeof(n_stream));
@@ -1881,7 +1531,7 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(flags);
 
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
@@ -1917,7 +1567,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
     }
 }
 
-void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
+void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
     const auto & cells = v_cells[cr.strm];
 
     for (const auto & range : cr.data) {
@@ -1945,7 +1595,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_
     }
 }
 
-void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
+void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
     const auto & cells = v_cells[cr.strm];
 
     const uint32_t v_trans = this->v_trans ? 1 : 0;
@@ -2040,7 +1690,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_
     }
 }
 
-bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head = v_heads[strm];
 
@@ -2137,7 +1787,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm
     return true;
 }
 
-bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
     auto & cells = v_cells[strm];
     auto & head = v_heads[strm];
 
@@ -2274,13 +1924,13 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm
 }
 
 //
-// llama_kv_cache_unified_context
+// llama_kv_cache_context
 //
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
+llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {}
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
     n_kv = kv->get_size();
 
     const uint32_t n_stream = kv->get_n_stream();
@@ -2296,26 +1946,25 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
     }
 }
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv,
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv,
         llama_context * lctx,
         bool do_shift,
-
-
-    if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
+        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
+    if (!do_shift && this->sc_info.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
 }
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv,
-        llama_kv_cache_unified::slot_info_vec_t sinfos,
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv,
+        llama_kv_cache::slot_info_vec_t sinfos,
         std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
 }
 
-llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
+llama_kv_cache_context::~llama_kv_cache_context() = default;
 
-bool llama_kv_cache_unified_context::next() {
+bool llama_kv_cache_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     if (++i_cur >= ubatches.size()) {
@@ -2325,12 +1974,12 @@ bool llama_kv_cache_unified_context::next() {
     return true;
 }
 
-bool llama_kv_cache_unified_context::apply() {
+bool llama_kv_cache_context::apply() {
     assert(!llama_memory_status_is_fail(status));
 
     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {
-        kv->update(lctx, do_shift,
+        kv->update(lctx, do_shift, sc_info);
 
         return true;
     }
@@ -2342,69 +1991,69 @@ bool llama_kv_cache_unified_context::apply() {
     return true;
 }
 
-llama_memory_status llama_kv_cache_unified_context::get_status() const {
+llama_memory_status llama_kv_cache_context::get_status() const {
     return status;
 }
 
-const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
+const llama_ubatch & llama_kv_cache_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     return ubatches[i_cur];
 }
 
-uint32_t llama_kv_cache_unified_context::get_n_kv() const {
+uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_unified_context::get_supports_set_rows() const {
+bool llama_kv_cache_context::get_supports_set_rows() const {
     return kv->get_supports_set_rows();
 }
 
-ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
     return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
     return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
     return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     return kv->build_input_k_idxs(ctx, ubatch);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     return kv->build_input_v_idxs(ctx, ubatch);
 }
 
-void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const {
+void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
     kv->set_input_k_shift(dst);
 }
 
-void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
 }
 
-void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
 }
 
-void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     kv->set_input_kq_mask(dst, ubatch, causal_attn);
 }
 
-void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_pos_bucket(dst, ubatch);
 }
 
-uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
+uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
     // the FA kernels require padding to avoid extra runtime boundary checks
     return cparams.flash_attn ? 256u : 32u;
 }