@fugood/llama.node 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/lib/binding.ts +7 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +15 -5
  4. package/src/LlamaCompletionWorker.cpp +12 -3
  5. package/src/LlamaCompletionWorker.h +3 -1
  6. package/src/LlamaContext.cpp +20 -2
  7. package/src/llama.cpp/common/arg.cpp +29 -19
  8. package/src/llama.cpp/common/chat.cpp +153 -3
  9. package/src/llama.cpp/common/chat.h +1 -0
  10. package/src/llama.cpp/common/common.cpp +10 -3
  11. package/src/llama.cpp/common/common.h +4 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  20. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  21. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  23. package/src/llama.cpp/include/llama.h +27 -1
  24. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  25. package/src/llama.cpp/src/llama-adapter.h +3 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  27. package/src/llama.cpp/src/llama-arch.h +4 -0
  28. package/src/llama.cpp/src/llama-context.cpp +80 -39
  29. package/src/llama.cpp/src/llama-context.h +0 -4
  30. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  31. package/src/llama.cpp/src/llama-graph.h +2 -1
  32. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  33. package/src/llama.cpp/src/llama-hparams.h +6 -0
  34. package/src/llama.cpp/src/llama-impl.h +2 -0
  35. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
  36. package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
  37. package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
  38. package/src/llama.cpp/src/llama-kv-cache.h +16 -28
  39. package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
  40. package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
  41. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  42. package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
  43. package/src/llama.cpp/src/llama-memory.h +8 -0
  44. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  45. package/src/llama.cpp/src/llama-model.cpp +302 -31
  46. package/src/llama.cpp/src/llama-model.h +1 -0
  47. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  48. package/src/llama.cpp/src/llama.cpp +12 -0
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -17,32 +17,25 @@
 //
 
 llama_kv_cache::llama_kv_cache(
-        const llama_model & model,
-        layer_filter_cb && filter,
-        ggml_type type_k,
-        ggml_type type_v,
-        bool v_trans,
-        bool offload,
-        bool unified,
-        uint32_t kv_size,
-        uint32_t n_seq_max,
-        uint32_t n_pad,
-        uint32_t n_swa,
-        llama_swa_type swa_type) :
+        const llama_model & model,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        bool offload,
+        bool unified,
+        uint32_t kv_size,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) :
     model(model), hparams(model.hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
 
-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    auto n_layer_cache = hparams.n_layer;
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        n_layer_cache = 20;
-    }
-    if (model.arch == LLM_ARCH_GLM4_MOE) {
-        // GLM-4.5: Only process up to last layer, skip final NextN layer
-        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
-    }
+    const uint32_t n_layer_kv = hparams.n_layer_kv();
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache::llama_kv_cache(
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc =*/ true,
             };
@@ -97,9 +90,14 @@
             __func__, hparams.n_embd_v_gqa_max());
     }
 
-    for (uint32_t il = 0; il < n_layer_cache; il++) {
+    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        if (!hparams.has_kv(il)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+            continue;
+        }
+
         if (filter && !filter(il)) {
-            LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+            LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
             continue;
         }
 
@@ -147,23 +145,27 @@
         layers.push_back({ il, k, v, k_stream, v_stream, });
     }
 
-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+    if (reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
 
-        for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
-            if (filter && !filter(il)) {
-                LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+            const int32_t il_reuse = reuse(il);
+
+            if (il_reuse < 0) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
                 continue;
             }
 
-            const bool is_swa = hparams.is_swa(il);
-            const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+            if (filter && !filter(il)) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+                continue;
+            }
 
             GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
             map_layer_ids[il] = map_layer_ids[il_reuse];
 
-            LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+            LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
         }
     }
 
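Note on the hunk above: the hard-coded GEMMA3N/GLM4_MOE handling is gone, replaced by the generic filter/reuse callbacks, with the per-architecture knowledge moving into llama_hparams (n_layer_kv(), has_kv()) and the model setup code that is not part of this excerpt. As a rough, hypothetical sketch only (not code from this package), the removed GEMMA3N special case maps onto the new layer_reuse_cb shape like this; is_swa below stands in for hparams.is_swa(il) and 20 is the value hard-coded in the deleted block:

    // Hypothetical sketch, not part of the package: the removed GEMMA3N logic
    // expressed as a layer_reuse_cb, i.e. std::function<int32_t(int32_t)>.
    #include <cstdint>
    #include <functional>

    std::function<int32_t(int32_t)> make_gemma3n_reuse(std::function<bool(int32_t)> is_swa) {
        const int32_t n_layer_cache = 20; // hard-coded value from the deleted block
        return [=](int32_t il) -> int32_t {
            if (il < n_layer_cache) {
                return -1;                               // keep this layer's own KV cells
            }
            return n_layer_cache - (is_swa(il) ? 2 : 1); // reuse layer 18 (SWA) or 19
        };
    }

As in the new constructor code above, the reuse target returned by such a callback must itself own KV cells; the GGML_ASSERT on map_layer_ids enforces that before the layer is aliased onto it.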
@@ -195,18 +197,6 @@ llama_kv_cache::llama_kv_cache(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }
 
 void llama_kv_cache::clear(bool data) {
@@ -549,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, false);
         if (sinfo_new.empty()) {
             success = false;
             break;
@@ -769,8 +756,8 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
             GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
         }
 
-        res.s0 = std::min<llama_seq_id>(res.s0, seq_to_stream[seq_id]);
-        res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
+        res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+        res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
 
         res.strm[s] = seq_to_stream[seq_id];
         res.idxs[s].reserve(n_tokens);
@@ -962,11 +949,11 @@ bool llama_kv_cache::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
-    for (uint32_t s = 0; s < n_stream; ++s) {
-        const auto & cells = v_cells[s];
+    for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+        const auto & cells = v_cells[sinfo.strm[s]];
 
         result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
     }
@@ -974,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1015,52 +998,42 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
         // note: v->nb[1] <= v->nb[2]
         return ggml_view_4d(ctx, v,
                 hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
-                ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
-                ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
+                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+                ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
+                ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
                 ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
     }
 
     // note: v->nb[1] > v->nb[2]
     return ggml_view_4d(ctx, v,
             n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
-            ggml_row_size(v->type, kv_size), // v->nb[2]
-            ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+            ggml_row_size(v->type, kv_size), // v->nb[2]
+            ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
 
-    if (k_idxs && supports_set_rows) {
-        if (k->ne[2] > 1) {
-            k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1070,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
 
-    if (v_idxs && supports_set_rows) {
-        if (!v_trans) {
-            if (v->ne[2] > 1) {
-                v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-            }
-
-            return ggml_set_rows(ctx, v, v_cur, v_idxs);
-        }
-
-        // [TAG_V_CACHE_VARIABLE]
-        if (n_embd_v_gqa < v->ne[0]) {
-            v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
         }
 
-        // the row becomes a single element
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * v_view = nullptr;
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
 
-    if (!v_trans) {
-        v_view = ggml_view_1d(ctx, v,
-                n_tokens*n_embd_v_gqa,
-                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
 
-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1] )*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
 
-    return ggml_cpy(ctx, v_cur, v_view);
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
 
 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
@@ -1141,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
 }
 
 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1161,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1983,8 +1925,7 @@ bool llama_kv_cache_context::apply() {
     }
 
     kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
-
-    n_kv = kv->get_n_kv();
+    n_kv = kv->get_n_kv(sinfos[i_cur]);
 
     return true;
 }
@@ -2003,10 +1944,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
package/src/llama.cpp/src/llama-kv-cache.h

@@ -21,9 +21,6 @@ class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);
 
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -41,8 +38,8 @@ public:
         using idx_vec_t = std::vector<uint32_t>;
 
         // number of streams: ns = s1 - s0 + 1
-        llama_seq_id s0;
-        llama_seq_id s1;
+        uint32_t s0;
+        uint32_t s1;
 
         std::vector<llama_seq_id> strm; // [ns]
         std::vector<idx_vec_t> idxs; // [ns]
@@ -82,18 +79,19 @@ public:
     using slot_info_vec_t = std::vector<slot_info>;
 
     llama_kv_cache(
-            const llama_model & model,
-            layer_filter_cb && filter,
-            ggml_type type_k,
-            ggml_type type_v,
-            bool v_trans,
-            bool offload,
-            bool unified,
-            uint32_t kv_size,
-            uint32_t n_seq_max,
-            uint32_t n_pad,
-            uint32_t n_swa,
-            llama_swa_type swa_type);
+            const llama_model & model,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool v_trans,
+            bool offload,
+            bool unified,
+            uint32_t kv_size,
+            uint32_t n_seq_max,
+            uint32_t n_pad,
+            uint32_t n_swa,
+            llama_swa_type swa_type,
+            const layer_filter_cb & filter,
+            const layer_reuse_cb & reuse);
 
     ~llama_kv_cache() = default;
 
@@ -141,10 +139,7 @@ public:
     // graph_build API
     //
 
-    uint32_t get_n_kv() const;
-
-    // TODO: temporary
-    bool get_supports_set_rows() const;
+    uint32_t get_n_kv(const slot_info & sinfo) const;
 
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -217,10 +212,6 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
@@ -320,9 +311,6 @@ public:
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
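For orientation, a hedged sketch of what a call site could look like after the header change above: the two callbacks now come last and are taken by const reference, so plain lambdas or nullptr can be passed. Every value below is a placeholder chosen for illustration and is not taken from this package; the only hard constraint visible in this diff is that kv_size must be a multiple of n_pad.

    // Illustrative only; placeholder values, not taken from @fugood/llama.node.
    auto kv = std::make_unique<llama_kv_cache>(
            model,                 // const llama_model &, assumed to be in scope
            GGML_TYPE_F16,         // type_k
            GGML_TYPE_F16,         // type_v
            /*v_trans   =*/ false,
            /*offload   =*/ true,
            /*unified   =*/ true,
            /*kv_size   =*/ 4096,
            /*n_seq_max =*/ 1,
            /*n_pad     =*/ 32,    // must divide kv_size (GGML_ASSERT in the constructor)
            /*n_swa     =*/ 0,
            LLAMA_SWA_TYPE_NONE,
            /*filter =*/ nullptr,  // empty filter: keep every layer that has KV
            /*reuse  =*/ nullptr); // empty reuse: no layer aliases another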
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -9,32 +9,29 @@
 //
 
 llama_memory_hybrid::llama_memory_hybrid(
-        const llama_model & model,
-        /* attn */
-        ggml_type type_k,
-        ggml_type type_v,
-        bool v_trans,
-        uint32_t kv_size,
-        uint32_t n_pad,
-        uint32_t n_swa,
-        llama_swa_type swa_type,
-        /* recurrent */
-        ggml_type type_r,
-        ggml_type type_s,
-        uint32_t rs_size,
-        /* common */
-        uint32_t n_seq_max,
-        bool offload,
-        bool unified,
-        /* layer filters */
-        layer_filter_cb && filter_attn,
-        layer_filter_cb && filter_recr) :
+        const llama_model & model,
+        /* attn */
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        uint32_t kv_size,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        /* recurrent */
+        ggml_type type_r,
+        ggml_type type_s,
+        uint32_t rs_size,
+        /* common */
+        uint32_t n_seq_max,
+        bool offload,
+        bool unified,
+        /* layer filters */
+        const layer_filter_cb & filter_attn,
+        const layer_filter_cb & filter_recr) :
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
         type_k,
         type_v,
         v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_pad,
         n_swa,
-        swa_type
+        swa_type,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        nullptr
     )),
     mem_recr(new llama_memory_recurrent(
         model,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr,
         type_r,
         type_s,
         offload,
         rs_size,
-        n_seq_max
+        n_seq_max,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr
     )) {}
 
 llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -18,31 +18,27 @@
 
 class llama_memory_hybrid : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_hybrid(
         const llama_model & model,
         /* attn */
-        ggml_type type_k,
-        ggml_type type_v,
-        bool v_trans,
-        uint32_t kv_size,
-        uint32_t n_pad,
-        uint32_t n_swa,
-        llama_swa_type swa_type,
-        /* recurrent */
-        ggml_type type_r,
-        ggml_type type_s,
-        uint32_t rs_size,
-        /* common */
-        uint32_t n_seq_max,
-        bool offload,
-        bool unified,
-        /* layer filters */
-        layer_filter_cb && filter_attn = nullptr,
-        layer_filter_cb && filter_recr = nullptr);
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        uint32_t kv_size,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        /* recurrent */
+        ggml_type type_r,
+        ggml_type type_s,
+        uint32_t rs_size,
+        /* common */
+        uint32_t n_seq_max,
+        bool offload,
+        bool unified,
+        /* layer filters */
+        const layer_filter_cb & filter_attn = nullptr,
+        const layer_filter_cb & filter_recr = nullptr);
 
     ~llama_memory_hybrid() = default;
 
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -16,13 +16,13 @@
 //
 
 llama_memory_recurrent::llama_memory_recurrent(
-        const llama_model & model,
-        layer_filter_cb && filter,
-        ggml_type type_r,
-        ggml_type type_s,
-        bool offload,
-        uint32_t mem_size,
-        uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+        const llama_model & model,
+        ggml_type type_r,
+        ggml_type type_s,
+        bool offload,
+        uint32_t mem_size,
+        uint32_t n_seq_max,
+        const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
     head = 0;
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -15,18 +15,14 @@
 // see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
     llama_memory_recurrent(
-            const llama_model & model,
-            layer_filter_cb && filter,
-            ggml_type type_r,
-            ggml_type type_s,
-            bool offload,
-            uint32_t mem_size,
-            uint32_t n_seq_max);
+            const llama_model & model,
+            ggml_type type_r,
+            ggml_type type_s,
+            bool offload,
+            uint32_t mem_size,
+            uint32_t n_seq_max,
+            const layer_filter_cb & filter);
 
     ~llama_memory_recurrent() = default;
 
package/src/llama.cpp/src/llama-memory.h

@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <memory>
+#include <functional>
 
 struct llama_ubatch;
 
@@ -64,6 +65,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 struct llama_memory_i {
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    // this callback is used to specify which layers should reuse memory from other layers
+    // return negative value to indicate that the layer il should not reuse memory
+    using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
     virtual ~llama_memory_i() = default;
 
     // split the input batch into a set of ubatches and verify that they can fit into the cache
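A small self-contained sketch of the two callback shapes introduced above; the predicates are toy values, while the real callbacks in the library are built from hparams (for example the is_recurrent-based defaults that llama-memory-hybrid.cpp passes in the hunks earlier in this diff):

    // Illustrative only: exercising layer_filter_cb / layer_reuse_cb with toy logic.
    #include <cstdint>
    #include <functional>

    using layer_filter_cb = std::function<bool(int32_t il)>;
    using layer_reuse_cb  = std::function<int32_t(int32_t il)>;

    int main() {
        layer_filter_cb filter = [](int32_t il) { return il != 0; };            // drop layer 0 from the cache
        layer_reuse_cb  reuse  = [](int32_t il) { return il >= 20 ? 19 : -1; }; // layers 20+ alias layer 19

        return (filter(1) && reuse(25) == 19 && reuse(3) == -1) ? 0 : 1;        // 0 means the expectations hold
    }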
package/src/llama.cpp/src/llama-model-loader.cpp

@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {
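The added LLAMA_LOG_DEBUG call only becomes visible when the host application's log callback does not drop debug-level messages. Below is a hedged sketch using the plain llama.cpp C API; whether and how @fugood/llama.node exposes log routing is not shown in this diff.

    // Illustrative only: route all llama.cpp log messages, including debug ones,
    // to stderr so the new per-tensor "loading tensor" lines can be observed.
    #include "llama.h"
    #include <cstdio>

    static void log_to_stderr(enum ggml_log_level /*level*/, const char * text, void * /*user_data*/) {
        fputs(text, stderr);
    }

    int main() {
        llama_log_set(log_to_stderr, nullptr);
        // ... load a model here; create_tensor() then emits one debug line per tensor ...
        return 0;
    }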