@fugood/llama.node 1.1.9 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +7 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +20 -2
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +153 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
- package/src/llama.cpp/src/llama-kv-cache.h +16 -28
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
- package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
- package/src/llama.cpp/src/llama-memory.h +8 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +302 -31
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -17,32 +17,25 @@
 //
 
 llama_kv_cache::llama_kv_cache(
-        const llama_model &  model,
-          layer_filter_cb && filter,
-                ggml_type    type_k,
-                ggml_type    type_v,
-                     bool    v_trans,
-                     bool    offload,
-                     bool    unified,
-                 uint32_t    kv_size,
-                 uint32_t    n_seq_max,
-                 uint32_t    n_pad,
-                 uint32_t    n_swa,
-           llama_swa_type    swa_type) :
+        const llama_model &  model,
+                ggml_type    type_k,
+                ggml_type    type_v,
+                     bool    v_trans,
+                     bool    offload,
+                     bool    unified,
+                 uint32_t    kv_size,
+                 uint32_t    n_seq_max,
+                 uint32_t    n_pad,
+                 uint32_t    n_swa,
+           llama_swa_type    swa_type,
+    const layer_filter_cb &  filter,
+     const layer_reuse_cb &  reuse) :
     model(model), hparams(model.hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
 
-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    auto n_layer_cache = hparams.n_layer;
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        n_layer_cache = 20;
-    }
-    if (model.arch == LLM_ARCH_GLM4_MOE) {
-        // GLM-4.5: Only process up to last layer, skip final NextN layer
-        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
-    }
+    const uint32_t n_layer_kv = hparams.n_layer_kv();
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache::llama_kv_cache(
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
@@ -97,9 +90,14 @@ llama_kv_cache::llama_kv_cache(
                 __func__, hparams.n_embd_v_gqa_max());
     }
 
-    for (uint32_t il = 0; il < n_layer_cache; il++) {
+    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        if (!hparams.has_kv(il)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+            continue;
+        }
+
         if (filter && !filter(il)) {
-            LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+            LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
             continue;
         }
 
@@ -147,23 +145,27 @@ llama_kv_cache::llama_kv_cache(
         layers.push_back({ il, k, v, k_stream, v_stream, });
     }
 
-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+    if (reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
 
-        for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
-            if (filter && !filter(il)) {
-                LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+            const int32_t il_reuse = reuse(il);
+
+            if (il_reuse < 0) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
                 continue;
             }
 
-            const bool     is_swa   = hparams.is_swa(il);
-            const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+            if (filter && !filter(il)) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+                continue;
+            }
 
             GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
             map_layer_ids[il] = map_layer_ids[il_reuse];
 
-            LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, is_swa);
+            LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
         }
     }
 
@@ -195,18 +197,6 @@ llama_kv_cache::llama_kv_cache(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }
 
 void llama_kv_cache::clear(bool data) {
@@ -549,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, false);
         if (sinfo_new.empty()) {
             success = false;
             break;
@@ -769,8 +756,8 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
             GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
         }
 
-        res.s0 = std::min<llama_seq_id>(res.s0, seq_to_stream[seq_id]);
-        res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
+        res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+        res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
 
         res.strm[s] = seq_to_stream[seq_id];
         res.idxs[s].reserve(n_tokens);
@@ -962,11 +949,11 @@ bool llama_kv_cache::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
-    for (uint32_t s = 0; s < n_stream; ++s) {
-        const auto & cells = v_cells[s];
+    for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+        const auto & cells = v_cells[sinfo.strm[s]];
 
         result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
     }
@@ -974,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1015,52 +998,42 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
         // note: v->nb[1] <= v->nb[2]
         return ggml_view_4d(ctx, v,
                 hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v),
-                ggml_row_size(v->type, n_embd_v_gqa),
-                ggml_row_size(v->type, n_embd_v_gqa*kv_size),
+                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+                ggml_row_size(v->type, n_embd_v_gqa),          // v->nb[2]
+                ggml_row_size(v->type, n_embd_v_gqa*kv_size),  // v->nb[3]
                 ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
     }
 
     // note: v->nb[1] > v->nb[2]
     return ggml_view_4d(ctx, v,
             n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v),
-            ggml_row_size(v->type, kv_size),
-            ggml_row_size(v->type, kv_size*n_embd_v_gqa),
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+            ggml_row_size(v->type, kv_size),                       // v->nb[2]
+            ggml_row_size(v->type, kv_size*n_embd_v_gqa),          // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
 
-    if (supports_set_rows) {
-        if (k->ne[2] > 1) {
-            k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1070,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
 
-    if (supports_set_rows) {
-        if (!v_trans) {
-            if (v->ne[2] > 1) {
-                v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-            }
-
-            return ggml_set_rows(ctx, v, v_cur, v_idxs);
-        }
-
-        // [TAG_V_CACHE_VARIABLE]
-        if (n_embd_v_gqa < v->ne[0]) {
-            v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
         }
 
-        // the row becomes a single element
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * v_view = nullptr;
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
 
-    if (!v_trans) {
-        v_view = ggml_view_1d(ctx, v,
-                n_tokens*n_embd_v_gqa,
-                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
 
-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1]      )*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
 
-    return ggml_cpy(ctx, v_cur, v_view);
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
 
 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
@@ -1141,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
 }
 
 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1161,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1983,8 +1925,7 @@ bool llama_kv_cache_context::apply() {
     }
 
     kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
-
-    n_kv = kv->get_n_kv();
+    n_kv = kv->get_n_kv(sinfos[i_cur]);
 
     return true;
 }
@@ -2003,10 +1944,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
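
Taken together, the llama-kv-cache.cpp hunks above replace the hard-coded GEMMA3N/GLM4_MOE special cases with two hparams queries (`n_layer_kv()`, `has_kv(il)`) plus an optional `reuse` callback supplied by the caller, and drop the `LLAMA_SET_ROWS` fallback path entirely. A minimal sketch of how a "share KV with an earlier layer" policy could be expressed through the new callback; the cut-off and the SWA/non-SWA mapping are illustrative assumptions, not the values any real architecture uses (the actual callbacks are built in llama-model.cpp, which is not shown here):

    // hypothetical reuse policy, for illustration only: layers past a
    // cut-off share the KV cells of one of the last two cached layers,
    // keeping SWA layers paired with a SWA layer
    const int32_t n_cached = 20; // hypothetical cut-off, not a real model value

    llama_memory_i::layer_reuse_cb reuse = [&](int32_t il) -> int32_t {
        if (il < n_cached) {
            return -1; // negative return: the layer keeps its own KV cells
        }
        return n_cached - (hparams.is_swa(il) ? 2 : 1); // assumes an hparams in scope
    };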
package/src/llama.cpp/src/llama-kv-cache.h

@@ -21,9 +21,6 @@ class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);
 
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -41,8 +38,8 @@ public:
         using idx_vec_t = std::vector<uint32_t>;
 
         // number of streams: ns = s1 - s0 + 1
-        llama_seq_id s0;
-        llama_seq_id s1;
+        uint32_t s0;
+        uint32_t s1;
 
         std::vector<llama_seq_id> strm; // [ns]
         std::vector<idx_vec_t>    idxs; // [ns]
@@ -82,18 +79,19 @@ public:
     using slot_info_vec_t = std::vector<slot_info>;
 
     llama_kv_cache(
-            const llama_model &  model,
-              layer_filter_cb && filter,
-                    ggml_type    type_k,
-                    ggml_type    type_v,
-                         bool    v_trans,
-                         bool    offload,
-                         bool    unified,
-                     uint32_t    kv_size,
-                     uint32_t    n_seq_max,
-                     uint32_t    n_pad,
-                     uint32_t    n_swa,
-               llama_swa_type    swa_type);
+            const llama_model &  model,
+                    ggml_type    type_k,
+                    ggml_type    type_v,
+                         bool    v_trans,
+                         bool    offload,
+                         bool    unified,
+                     uint32_t    kv_size,
+                     uint32_t    n_seq_max,
+                     uint32_t    n_pad,
+                     uint32_t    n_swa,
+               llama_swa_type    swa_type,
+        const layer_filter_cb &  filter,
+         const layer_reuse_cb &  reuse);
 
     ~llama_kv_cache() = default;
 
@@ -141,10 +139,7 @@ public:
     // graph_build API
     //
 
-    uint32_t get_n_kv() const;
-
-    // TODO: temporary
-    bool get_supports_set_rows() const;
+    uint32_t get_n_kv(const slot_info & sinfo) const;
 
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -217,10 +212,6 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
@@ -320,9 +311,6 @@ public:
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -9,32 +9,29 @@
 //
 
 llama_memory_hybrid::llama_memory_hybrid(
-        const llama_model &  model,
-                             /* attn */
-                ggml_type    type_k,
-                ggml_type    type_v,
-                     bool    v_trans,
-                 uint32_t    kv_size,
-                 uint32_t    n_pad,
-                 uint32_t    n_swa,
-           llama_swa_type    swa_type,
-                             /* recurrent */
-                ggml_type    type_r,
-                ggml_type    type_s,
-                 uint32_t    rs_size,
-                             /* common */
-                 uint32_t    n_seq_max,
-                     bool    offload,
-                     bool    unified,
-                             /* layer filters */
-          layer_filter_cb && filter_attn,
-          layer_filter_cb && filter_recr) :
+        const llama_model &  model,
+                             /* attn */
+                ggml_type    type_k,
+                ggml_type    type_v,
+                     bool    v_trans,
+                 uint32_t    kv_size,
+                 uint32_t    n_pad,
+                 uint32_t    n_swa,
+           llama_swa_type    swa_type,
+                             /* recurrent */
+                ggml_type    type_r,
+                ggml_type    type_s,
+                 uint32_t    rs_size,
+                             /* common */
+                 uint32_t    n_seq_max,
+                     bool    offload,
+                     bool    unified,
+                             /* layer filters */
+    const layer_filter_cb &  filter_attn,
+    const layer_filter_cb &  filter_recr) :
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
         type_k,
         type_v,
         v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_pad,
         n_swa,
-        swa_type
+        swa_type,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        nullptr
     )),
     mem_recr(new llama_memory_recurrent(
         model,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr,
         type_r,
         type_s,
         offload,
         rs_size,
-        n_seq_max
+        n_seq_max,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr
     )) {}
 
 llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
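
The constructor hunks above move the layer filters from the second position to trailing parameters passed as const references, with nullptr meaning "derive the filter from the layer type". A standalone sketch of that default-filter pattern, assuming a stand-in `is_recurrent` predicate in place of `llama_hparams::is_recurrent`:

    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t il)>;

    // mirrors the ternaries in the constructor above: a null filter falls
    // back to a predicate derived from the layer type
    layer_filter_cb attn_filter(const layer_filter_cb & filter_attn,
                                std::function<bool(int32_t)> is_recurrent) {
        return filter_attn == nullptr
            ? layer_filter_cb([is_recurrent](int32_t il) { return !is_recurrent(il); })
            : filter_attn;
    }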
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -18,31 +18,27 @@
 
 class llama_memory_hybrid : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_hybrid(
         const llama_model & model,
                             /* attn */
-                ggml_type    type_k,
-                ggml_type    type_v,
-                     bool    v_trans,
-                 uint32_t    kv_size,
-                 uint32_t    n_pad,
-                 uint32_t    n_swa,
-           llama_swa_type    swa_type,
-                             /* recurrent */
-                ggml_type    type_r,
-                ggml_type    type_s,
-                 uint32_t    rs_size,
-                             /* common */
-                 uint32_t    n_seq_max,
-                     bool    offload,
-                     bool    unified,
-                             /* layer filters */
-          layer_filter_cb && filter_attn = nullptr,
-          layer_filter_cb && filter_recr = nullptr);
+                ggml_type    type_k,
+                ggml_type    type_v,
+                     bool    v_trans,
+                 uint32_t    kv_size,
+                 uint32_t    n_pad,
+                 uint32_t    n_swa,
+           llama_swa_type    swa_type,
+                             /* recurrent */
+                ggml_type    type_r,
+                ggml_type    type_s,
+                 uint32_t    rs_size,
+                             /* common */
+                 uint32_t    n_seq_max,
+                     bool    offload,
+                     bool    unified,
+                             /* layer filters */
+    const layer_filter_cb &  filter_attn = nullptr,
+    const layer_filter_cb &  filter_recr = nullptr);
 
     ~llama_memory_hybrid() = default;
 
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -16,13 +16,13 @@
 //
 
 llama_memory_recurrent::llama_memory_recurrent(
-        const llama_model &  model,
-          layer_filter_cb && filter,
-                ggml_type    type_r,
-                ggml_type    type_s,
-                     bool    offload,
-                 uint32_t    mem_size,
-                 uint32_t    n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+        const llama_model &  model,
+                ggml_type    type_r,
+                ggml_type    type_s,
+                     bool    offload,
+                 uint32_t    mem_size,
+                 uint32_t    n_seq_max,
+    const layer_filter_cb &  filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
     head = 0;
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -15,18 +15,14 @@
 // see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_recurrent(
-            const llama_model &  model,
-              layer_filter_cb && filter,
-                    ggml_type    type_r,
-                    ggml_type    type_s,
-                         bool    offload,
-                     uint32_t    mem_size,
-                     uint32_t    n_seq_max);
+            const llama_model &  model,
+                    ggml_type    type_r,
+                    ggml_type    type_s,
+                         bool    offload,
+                     uint32_t    mem_size,
+                     uint32_t    n_seq_max,
+        const layer_filter_cb &  filter);
 
     ~llama_memory_recurrent() = default;
 
package/src/llama.cpp/src/llama-memory.h

@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <memory>
+#include <functional>
 
 struct llama_ubatch;
 
@@ -64,6 +65,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 struct llama_memory_i {
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    // this callback is used to specify which layers should reuse memory from other layers
+    // return negative value to indicate that the layer il should not reuse memory
+    using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
     virtual ~llama_memory_i() = default;
 
     // split the input batch into a set of ubatches and verify that they can fit into the cache
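
With the two aliases now declared on `llama_memory_i`, every memory implementation (kv-cache, hybrid, recurrent) shares one callback vocabulary instead of redeclaring `layer_filter_cb` per class. Trivial instantiations, as a sketch of the intended semantics:

    #include <cstdint>
    #include <functional>

    // mirror of the aliases added to llama_memory_i above
    using layer_filter_cb = std::function<bool(int32_t il)>;
    using layer_reuse_cb  = std::function<int32_t(int32_t il)>;

    const layer_filter_cb keep_all = [](int32_t /*il*/) { return true; }; // cache every layer
    const layer_reuse_cb  no_reuse = [](int32_t /*il*/) { return -1;   }; // negative: no reuse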
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {