@fugood/llama.node 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +20 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +13 -4
- package/src/llama.cpp/common/chat.cpp +33 -2
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -197
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +449 -246
- package/src/llama.cpp/src/llama-model.h +2 -0
package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp}

@@ -1,4 +1,4 @@
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 
 #include "llama-impl.h"
 #include "llama-io.h"
@@ -13,10 +13,10 @@
 #include <stdexcept>
 
 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //
 
-llama_kv_cache_unified::llama_kv_cache_unified(
+llama_kv_cache::llama_kv_cache(
         const llama_model & model,
           layer_filter_cb && filter,
                 ggml_type    type_k,
@@ -209,7 +209,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     }
 }
 
-void llama_kv_cache_unified::clear(bool data) {
+void llama_kv_cache::clear(bool data) {
     for (uint32_t s = 0; s < n_stream; ++s) {
         v_cells[s].reset();
         v_heads[s] = 0;
@@ -222,7 +222,7 @@ void llama_kv_cache_unified::clear(bool data) {
     }
 }
 
-bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
@@ -285,7 +285,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     return true;
 }
 
-void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
     GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
 
@@ -368,7 +368,7 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
     //}
 }
 
-void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -390,7 +390,7 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
     }
 }
 
-void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -434,7 +434,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
     head = new_head != cells.size() ? new_head : 0;
 }
 
-void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -467,7 +467,7 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
     }
 }
 
-llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const {
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -475,7 +475,7 @@ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
     return cells.seq_pos_min(seq_id);
 }
 
-llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
     const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -483,7 +483,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_batch(
+llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
         bool embd_all) {
@@ -513,62 +513,34 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch(
             break;
         }
 
-        return std::make_unique<llama_kv_cache_unified_context>(
+        return std::make_unique<llama_kv_cache_context>(
                 this, std::move(sinfos), std::move(ubatches));
     } while (false);
 
-    return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+    return std::make_unique<llama_kv_cache_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_full() {
-    return std::make_unique<llama_kv_cache_unified_context>(this);
+llama_memory_context_ptr llama_kv_cache::init_full() {
+    return std::make_unique<llama_kv_cache_context>(this);
 }
 
-llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
-    bool do_shift = get_has_shift();
-
-    defrag_info dinfo;
-
-    // see if we need to defrag
-    if (n_stream == 1) {
-        // note : for now do not consider defrag for n_stream > 1
-        const auto & cells = v_cells[seq_to_stream[0]];
-
-        bool do_defrag = optimize;
-
-        const auto thold = lctx->get_cparams().defrag_thold;
-
-        if (!do_defrag && thold > 0.0f) {
-            const auto n_kv = cells.used_max_p1();
-
-            // - do not defrag small contexts (i.e. < 2048 tokens)
-            // - count the padding towards the number of used tokens
-            const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
-
-            if (fragmentation > thold) {
-                LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
-
-                do_defrag = true;
-            }
-        }
+llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
+    GGML_UNUSED(optimize);
 
-        if (do_defrag) {
-            dinfo = defrag_prepare(lctx->graph_max_nodes());
-        }
-    }
+    bool do_shift = get_has_shift();
 
-    return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
+    return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
 }
 
-llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
-    llama_kv_cache_unified::slot_info_vec_t res;
+llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
+    llama_kv_cache::slot_info_vec_t res;
 
     struct state_t {
         slot_info sinfo; // slot info for the ubatch
 
         std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch
 
-        std::vector<llama_kv_cells_unified> v_cells; // copy of the old cells, before placing the ubatch
+        std::vector<llama_kv_cells> v_cells; // copy of the old cells, before placing the ubatch
     };
 
     // remember the old state of the cells so we can restore it in the end
@@ -629,7 +601,7 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st
     return res;
 }
 
-bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
+bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
     bool updated = false;
 
     auto * sched = lctx->get_sched();
@@ -699,57 +671,10 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
         }
     }
 
-    if (!dinfo.empty()) {
-        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-
-        // note: for now do not consider defrag for n_stream > 1
-        auto & cells = v_cells[seq_to_stream[0]];
-        auto & head  = v_heads[seq_to_stream[0]];
-
-        // apply moves:
-        {
-            const auto n_kv = dinfo.ids.size();
-
-            for (uint32_t i = 0; i < n_kv; ++i) {
-                assert(dinfo.ids[i] <= n_kv);
-
-                if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
-                    continue;
-                }
-
-                cells.mv(i, dinfo.ids[i]);
-            }
-
-            // reset the head so we can find the first free slot during the next ubatch
-            head = 0;
-        }
-
-        ggml_backend_sched_reset(sched);
-
-        auto * res = lctx->get_gf_res_reserve();
-
-        res->reset();
-
-        auto * gf = build_graph_defrag(res, lctx, dinfo);
-        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
-            return updated;
-        }
-
-        res->set_inputs(nullptr);
-
-        if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
-            LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
-            return updated;
-        }
-
-        updated = true;
-    }
-
     return updated;
 }
 
-llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
+llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const {
 
     if (debug > 0) {
         for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
@@ -948,7 +873,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
     return res;
 }
 
-void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
+void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
     // keep track of the max sequence position that we would overwrite with this ubatch
     // for non-SWA cache, this would be always empty
     llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
@@ -1013,21 +938,21 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u
     }
 }
 
-bool llama_kv_cache_unified::get_can_shift() const {
+bool llama_kv_cache::get_can_shift() const {
     return true;
 }
 
-uint32_t llama_kv_cache_unified::get_size() const {
+uint32_t llama_kv_cache::get_size() const {
     const auto & cells = v_cells[seq_to_stream[0]];
 
     return cells.size();
 }
 
-uint32_t llama_kv_cache_unified::get_n_stream() const {
+uint32_t llama_kv_cache::get_n_stream() const {
     return n_stream;
 }
 
-bool llama_kv_cache_unified::get_has_shift() const {
+bool llama_kv_cache::get_has_shift() const {
     bool result = false;
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1037,7 +962,7 @@ bool llama_kv_cache_unified::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache_unified::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv() const {
     uint32_t result = 0;
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1049,11 +974,11 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache_unified::get_supports_set_rows() const {
+bool llama_kv_cache::get_supports_set_rows() const {
     return supports_set_rows;
 }
 
-ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
@@ -1073,7 +998,7 @@ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint
             ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
 }
 
-ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1105,7 +1030,7 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
             ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
@@ -1135,7 +1060,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_
     return ggml_cpy(ctx, k_cur, k_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1189,7 +1114,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     const uint32_t n_tokens = ubatch.n_tokens;
 
     ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
@@ -1199,7 +1124,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, con
     return k_idxs;
 }
 
-ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     const uint32_t n_tokens = ubatch.n_tokens;
 
     ggml_tensor * v_idxs;
@@ -1215,7 +1140,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, con
     return v_idxs;
 }
 
-void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
     if (!supports_set_rows) {
         return;
     }
@@ -1235,7 +1160,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba
     }
 }
 
-void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
     if (!supports_set_rows) {
         return;
     }
@@ -1272,7 +1197,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
     }
 }
 
-void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
 
     int32_t * data = (int32_t *) dst->data;
@@ -1286,7 +1211,7 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
     }
 }
 
-void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const uint32_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
@@ -1358,7 +1283,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
     }
 }
 
-void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     const int64_t n_tokens = ubatch->n_tokens;
 
     GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
@@ -1383,7 +1308,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
     }
 }
 
-size_t llama_kv_cache_unified::total_size() const {
+size_t llama_kv_cache::total_size() const {
     size_t size = 0;
 
     for (const auto & buf : bufs) {
@@ -1393,7 +1318,7 @@ size_t llama_kv_cache_unified::total_size() const {
     return size;
 }
 
-size_t llama_kv_cache_unified::size_k_bytes() const {
+size_t llama_kv_cache::size_k_bytes() const {
     size_t size_k_bytes = 0;
 
     for (const auto & layer : layers) {
@@ -1403,7 +1328,7 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
     return size_k_bytes;
 }
 
-size_t llama_kv_cache_unified::size_v_bytes() const {
+size_t llama_kv_cache::size_v_bytes() const {
     size_t size_v_bytes = 0;
 
     for (const auto & layer : layers) {
@@ -1413,7 +1338,7 @@ size_t llama_kv_cache_unified::size_v_bytes() const {
     return size_v_bytes;
 }
 
-ggml_tensor * llama_kv_cache_unified::build_rope_shift(
+ggml_tensor * llama_kv_cache::build_rope_shift(
     const llama_cparams & cparams,
            ggml_context * ctx,
             ggml_tensor * cur,
@@ -1465,14 +1390,14 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
 
 class llm_graph_input_k_shift : public llm_graph_input_i {
 public:
-    llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {}
     virtual ~llm_graph_input_k_shift() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * k_shift; // I32 [kv_size*n_stream]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache * kv_self;
 };
 
 void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
@@ -1483,7 +1408,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
+ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
     auto * ctx = res->get_ctx();
     auto * gf  = res->get_gf();
 
@@ -1525,284 +1450,7 @@ ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res,
     return gf;
 }
 
-ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
-                llm_graph_result * res,
-                   llama_context * lctx,
-               const defrag_info & dinfo) const {
-    auto * ctx = res->get_ctx();
-    auto * gf  = res->get_gf();
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const auto & ids = dinfo.ids;
-
-    const auto & cparams = lctx->get_cparams();
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    //       - multiple threads
-    //       - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size     = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(v_l[il]->type);
-        const size_t v_size    = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os = i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os = i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    for (uint32_t i = 0; i < ids.size(); ++i) {
-        const uint32_t id = ids[i];
-
-        if (i == id || id == ids.size()) {
-            continue;
-        }
-
-        uint32_t nm = 1;
-
-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-            nm++;
-        }
-
-        for (const auto & layer : layers) {
-            const uint32_t il = layer.il;
-
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-            ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*i));
-
-            ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*id));
-
-            ggml_tensor * view_v_src;
-            ggml_tensor * view_v_dst;
-
-            if (cparams.flash_attn) {
-                // NOTE: the V cache is not transposed when using flash attention
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*id));
-            } else {
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, id));
-            }
-
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-
-    return gf;
-}
-
-llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const uint32_t n_layer = layers.size();
-
-    const uint32_t n_kv   = cells.used_max_p1();
-    const uint32_t n_used = cells.get_used();
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
-    //   - source view, destination view, copy operation
-    //   - x2 for keys and values
-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    defrag_info res;
-    auto & ids = res.ids;
-
-    ids.resize(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        if (!cells.is_empty(i0)) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            if (cells.is_empty(is) || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            if (cells.is_empty(i1) || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return {};
-    }
-
-    LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
-    LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
-
-    return res;
-}
-
-bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
     assert(p0 >= 0 && p1 >= 0);
 
     switch (swa_type) {
@@ -1828,7 +1476,7 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 
     io.write(&n_stream, sizeof(n_stream));
@@ -1881,7 +1529,7 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(flags);
 
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
@@ -1917,7 +1565,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
     }
 }
 
-void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
+void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
     const auto & cells = v_cells[cr.strm];
 
     for (const auto & range : cr.data) {
@@ -1945,7 +1593,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_
     }
 }
 
-void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
+void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
     const auto & cells = v_cells[cr.strm];
 
     const uint32_t v_trans = this->v_trans ? 1 : 0;
@@ -2040,7 +1688,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_
     }
 }
 
-bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head  = v_heads[strm];
 
@@ -2137,7 +1785,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm
     return true;
 }
 
-bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
     auto & cells = v_cells[strm];
     auto & head  = v_heads[strm];
 
@@ -2274,13 +1922,13 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm
 }
 
 //
-// llama_kv_cache_unified_context
+// llama_kv_cache_context
 //
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
+llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {}
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
     n_kv = kv->get_size();
 
     const uint32_t n_stream = kv->get_n_stream();
@@ -2296,26 +1944,25 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
     }
 }
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv,
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv,
         llama_context * lctx,
         bool do_shift,
-        defrag_info dinfo,
-        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) {
-    if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
+        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
+    if (!do_shift && this->sc_info.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
 }
 
-llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv,
-        llama_kv_cache_unified::slot_info_vec_t sinfos,
+llama_kv_cache_context::llama_kv_cache_context(
+        llama_kv_cache * kv,
+        llama_kv_cache::slot_info_vec_t sinfos,
         std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
 }
 
-llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
+llama_kv_cache_context::~llama_kv_cache_context() = default;
 
-bool llama_kv_cache_unified_context::next() {
+bool llama_kv_cache_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     if (++i_cur >= ubatches.size()) {
@@ -2325,12 +1972,12 @@ bool llama_kv_cache_unified_context::next() {
     return true;
 }
 
-bool llama_kv_cache_unified_context::apply() {
+bool llama_kv_cache_context::apply() {
     assert(!llama_memory_status_is_fail(status));
 
     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {
-        kv->update(lctx, do_shift, dinfo, sc_info);
+        kv->update(lctx, do_shift, sc_info);
 
         return true;
     }
@@ -2342,69 +1989,69 @@ bool llama_kv_cache_unified_context::apply() {
     return true;
 }
 
-llama_memory_status llama_kv_cache_unified_context::get_status() const {
+llama_memory_status llama_kv_cache_context::get_status() const {
     return status;
 }
 
-const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
+const llama_ubatch & llama_kv_cache_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     return ubatches[i_cur];
 }
 
-uint32_t llama_kv_cache_unified_context::get_n_kv() const {
+uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_unified_context::get_supports_set_rows() const {
+bool llama_kv_cache_context::get_supports_set_rows() const {
     return kv->get_supports_set_rows();
 }
 
-ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
     return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
     return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
+ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
     return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     return kv->build_input_k_idxs(ctx, ubatch);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
     return kv->build_input_v_idxs(ctx, ubatch);
 }
 
-void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const {
+void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
     kv->set_input_k_shift(dst);
 }
 
-void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
 }
 
-void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
 }
 
-void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     kv->set_input_kq_mask(dst, ubatch, causal_attn);
 }
 
-void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_pos_bucket(dst, ubatch);
 }
 
-uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
+uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
     // the FA kernels require padding to avoid extra runtime boundary checks
     return cparams.flash_attn ? 256u : 32u;
 }