@novastera-oss/llamarn 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +12 -8
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +46 -65
  13. package/cpp/LlamaCppModel.h +5 -0
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/README.md +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
  17. package/cpp/llama.cpp/common/arg.cpp +8 -6
  18. package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
  19. package/cpp/llama.cpp/common/chat-parser.h +2 -1
  20. package/cpp/llama.cpp/common/chat.cpp +4 -4
  21. package/cpp/llama.cpp/common/common.cpp +2 -0
  22. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  23. package/cpp/llama.cpp/common/json-partial.h +2 -1
  24. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  25. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
  27. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  28. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  30. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
  32. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
  35. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  39. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
  41. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  42. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  43. package/cpp/llama.cpp/include/llama.h +12 -8
  44. package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
  45. package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
  46. package/cpp/llama.cpp/src/llama-batch.h +15 -10
  47. package/cpp/llama.cpp/src/llama-context.cpp +226 -151
  48. package/cpp/llama.cpp/src/llama-context.h +25 -8
  49. package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
  50. package/cpp/llama.cpp/src/llama-graph.h +25 -24
  51. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
  52. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
  53. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
  54. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
  55. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
  56. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
  57. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
  58. package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
  59. package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
  60. package/cpp/llama.cpp/src/llama-memory.h +44 -0
  61. package/cpp/llama.cpp/src/llama-model.cpp +23 -16
  62. package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
  63. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  64. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  65. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  66. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  67. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  68. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  69. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  70. package/cpp/rn-completion.cpp +63 -8
  71. package/cpp/rn-utils.hpp +8 -1
  72. package/ios/include/common/minja/chat-template.hpp +1 -1
  73. package/ios/include/common/minja/minja.hpp +1 -1
  74. package/ios/include/json-schema-to-grammar.h +4 -4
  75. package/ios/include/llama.h +12 -8
  76. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  77. package/ios/libs/llama.xcframework/Info.plist +22 -22
  78. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  79. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
  80. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  81. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
  82. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  83. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  84. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  85. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
  86. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  87. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  88. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  89. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  90. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  91. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
  92. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  93. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
  94. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  95. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
  96. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  97. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  98. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
  99. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  100. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  101. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  102. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
  103. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  104. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
  105. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  106. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  107. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
  108. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
  109. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  110. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  111. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  112. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  113. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
  114. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  115. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
  116. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  117. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  118. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
  119. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
  120. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  121. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  122. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  123. package/package.json +1 -1
@@ -0,0 +1,1132 @@
1
+ #include "llama-kv-cache-recurrent.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-model.h"
6
+
7
+ #include <algorithm>
8
+ #include <cassert>
9
+ #include <limits>
10
+ #include <map>
11
+ #include <stdexcept>
12
+
13
+ //
14
+ // llama_kv_cache_recurrent
15
+ //
16
+
17
+ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
18
+ const llama_model & model,
19
+ ggml_type type_k,
20
+ ggml_type type_v,
21
+ bool offload,
22
+ uint32_t kv_size,
23
+ uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
24
+ const int32_t n_layer = hparams.n_layer;
25
+
26
+ LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
27
+ __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
28
+
29
+ head = 0;
30
+ size = kv_size;
31
+ used = 0;
32
+
33
+ cells.clear();
34
+ cells.resize(kv_size);
35
+
36
+ // create a context for each buffer type
37
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
38
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
39
+ auto it = ctx_map.find(buft);
40
+ if (it == ctx_map.end()) {
41
+ ggml_init_params params = {
42
+ /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
43
+ /*.mem_buffer =*/ NULL,
44
+ /*.no_alloc =*/ true,
45
+ };
46
+
47
+ ggml_context * ctx = ggml_init(params);
48
+ if (!ctx) {
49
+ return nullptr;
50
+ }
51
+
52
+ ctx_map[buft] = ctx;
53
+ ctxs.emplace_back(ctx);
54
+
55
+ return ctx;
56
+ }
57
+
58
+ return it->second;
59
+ };
60
+
61
+ k_l.reserve(n_layer);
62
+ v_l.reserve(n_layer);
63
+
64
+ for (int i = 0; i < n_layer; i++) {
65
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
66
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
67
+
68
+ const char * dev_name = "CPU";
69
+
70
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
71
+
72
+ if (offload) {
73
+ auto * dev = model.dev_layer(i);
74
+ buft = ggml_backend_dev_buffer_type(dev);
75
+
76
+ dev_name = ggml_backend_dev_name(dev);
77
+ }
78
+
79
+ LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
80
+
81
+ ggml_context * ctx = ctx_for_buft(buft);
82
+ if (!ctx) {
83
+ throw std::runtime_error("failed to create ggml context for kv cache");
84
+ }
85
+
86
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
87
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
88
+ ggml_format_name(k, "cache_k_l%d", i);
89
+ ggml_format_name(v, "cache_v_l%d", i);
90
+ k_l.push_back(k);
91
+ v_l.push_back(v);
92
+ }
93
+
94
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
95
+ for (auto it : ctx_map) {
96
+ auto * buft = it.first;
97
+ auto * ctx = it.second;
98
+
99
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
100
+ if (!buf) {
101
+ throw std::runtime_error("failed to allocate buffer for kv cache");
102
+ }
103
+ ggml_backend_buffer_clear(buf, 0);
104
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
105
+ bufs.emplace_back(buf);
106
+ }
107
+
108
+ {
109
+ const size_t memory_size_k = size_k_bytes();
110
+ const size_t memory_size_v = size_v_bytes();
111
+
112
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
113
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
114
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
115
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
116
+ }
117
+ }
118
+
119
+ void llama_kv_cache_recurrent::clear() {
120
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
121
+ cells[i].pos = -1;
122
+ cells[i].seq_id.clear();
123
+ cells[i].src = -1;
124
+ cells[i].tail = -1;
125
+ }
126
+ head = 0;
127
+ used = 0;
128
+
129
+ for (auto & buf : bufs) {
130
+ ggml_backend_buffer_clear(buf.get(), 0);
131
+ }
132
+ }
133
+
134
+ bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
135
+ uint32_t new_head = size;
136
+
137
+ if (p0 < 0) {
138
+ p0 = 0;
139
+ }
140
+
141
+ if (p1 < 0) {
142
+ p1 = std::numeric_limits<llama_pos>::max();
143
+ }
144
+
145
+ // models like Mamba or RWKV can't have a state partially erased
146
+ if (seq_id >= (int64_t) size) {
147
+ // could be fatal
148
+ return false;
149
+ }
150
+ if (0 <= seq_id) {
151
+ int32_t & tail_id = cells[seq_id].tail;
152
+ if (tail_id >= 0) {
153
+ const kv_cell & cell = cells[tail_id];
154
+ // partial intersection is invalid
155
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
156
+ return false;
157
+ }
158
+ // invalidate tails which will be cleared
159
+ if (p0 <= cell.pos && cell.pos < p1) {
160
+ tail_id = -1;
161
+ }
162
+ }
163
+ } else {
164
+ // seq_id is negative, then the range should include everything or nothing
165
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
166
+ return false;
167
+ }
168
+ }
169
+
170
+ for (uint32_t i = 0; i < size; ++i) {
171
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
172
+ if (seq_id < 0) {
173
+ cells[i].seq_id.clear();
174
+ } else if (cells[i].has_seq_id(seq_id)) {
175
+ cells[i].seq_id.erase(seq_id);
176
+ } else {
177
+ continue;
178
+ }
179
+ if (cells[i].is_empty()) {
180
+ // keep count of the number of used cells
181
+ if (cells[i].pos >= 0) {
182
+ used--;
183
+ }
184
+ cells[i].pos = -1;
185
+ cells[i].src = -1;
186
+ if (new_head == size) {
187
+ new_head = i;
188
+ }
189
+ }
190
+ }
191
+ }
192
+
193
+ // If we freed up a slot, set head to it so searching can start there.
194
+ if (new_head != size && new_head < head) {
195
+ head = new_head;
196
+ }
197
+
198
+ return true;
199
+ }
200
+
201
+ void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
202
+ if (seq_id_src == seq_id_dst) {
203
+ return;
204
+ }
205
+
206
+ if (p0 < 0) {
207
+ p0 = 0;
208
+ }
209
+
210
+ if (p1 < 0) {
211
+ p1 = std::numeric_limits<llama_pos>::max();
212
+ }
213
+
214
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
215
+ kv_cell & tail_src = cells[seq_id_src];
216
+ kv_cell & tail_dst = cells[seq_id_dst];
217
+ if (tail_dst.tail >= 0) {
218
+ // clear destination seq_id if it wasn't empty
219
+ kv_cell & cell_dst = cells[tail_dst.tail];
220
+
221
+ cell_dst.seq_id.erase(seq_id_dst);
222
+ tail_dst.tail = -1;
223
+ if (cell_dst.seq_id.empty()) {
224
+ cell_dst.pos = -1;
225
+ cell_dst.src = -1;
226
+ used -= 1;
227
+ }
228
+ }
229
+ if (tail_src.tail >= 0) {
230
+ kv_cell & cell_src = cells[tail_src.tail];
231
+
232
+ cell_src.seq_id.insert(seq_id_dst);
233
+ tail_dst.tail = tail_src.tail;
234
+ }
235
+ }
236
+ }
237
+
238
+ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
239
+ uint32_t new_head = size;
240
+
241
+ for (uint32_t i = 0; i < size; ++i) {
242
+ if ((llama_seq_id) i != seq_id) {
243
+ cells[i].tail = -1;
244
+ }
245
+
246
+ if (!cells[i].has_seq_id(seq_id)) {
247
+ if (cells[i].pos >= 0) {
248
+ used--;
249
+ }
250
+
251
+ cells[i].pos = -1;
252
+ cells[i].src = -1;
253
+ cells[i].seq_id.clear();
254
+
255
+ if (new_head == size){
256
+ new_head = i;
257
+ }
258
+ } else {
259
+ cells[i].seq_id.clear();
260
+ cells[i].seq_id.insert(seq_id);
261
+ }
262
+ }
263
+
264
+ // If we freed up a slot, set head to it so searching can start there.
265
+ if (new_head != size && new_head < head) {
266
+ head = new_head;
267
+ }
268
+ }
269
+
270
+ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
271
+ if (shift == 0) {
272
+ return;
273
+ }
274
+
275
+ if (p0 < 0) {
276
+ p0 = 0;
277
+ }
278
+
279
+ if (p1 < 0) {
280
+ p1 = std::numeric_limits<llama_pos>::max();
281
+ }
282
+
283
+ // If there is no range then return early to avoid looping over the
284
+ if (p0 == p1) {
285
+ return;
286
+ }
287
+
288
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
289
+ if (0 <= seq_id && seq_id < (int64_t) size) {
290
+ const int32_t tail_id = cells[seq_id].tail;
291
+ if (tail_id >= 0) {
292
+ kv_cell & cell = cells[tail_id];
293
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
294
+ cell.pos += shift;
295
+ }
296
+ }
297
+ }
298
+ }
299
+
300
+ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
301
+ if (d == 1) {
302
+ return;
303
+ }
304
+
305
+ if (p0 < 0) {
306
+ p0 = 0;
307
+ }
308
+
309
+ if (p1 < 0) {
310
+ p1 = std::numeric_limits<llama_pos>::max();
311
+ }
312
+
313
+ // If there is no range then return early to avoid looping over the cache.
314
+ if (p0 == p1) {
315
+ return;
316
+ }
317
+
318
+ // for Mamba-like or RWKV models, only the pos needs to be changed
319
+ if (0 <= seq_id && seq_id < (int64_t) size) {
320
+ const int32_t tail_id = cells[seq_id].tail;
321
+ if (tail_id >= 0) {
322
+ kv_cell & cell = cells[tail_id];
323
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
324
+ cell.pos /= d;
325
+ }
326
+ }
327
+ }
328
+ }
329
+
330
+ llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
331
+ llama_pos result = std::numeric_limits<llama_pos>::max();
332
+
333
+ for (uint32_t i = 0; i < size; ++i) {
334
+ if (cells[i].has_seq_id(seq_id)) {
335
+ result = std::min(result, cells[i].pos);
336
+ }
337
+ }
338
+
339
+ if (result == std::numeric_limits<llama_pos>::max()) {
340
+ result = -1;
341
+ }
342
+
343
+ return result;
344
+ }
345
+
346
+ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
347
+ llama_pos result = -1;
348
+
349
+ for (uint32_t i = 0; i < size; ++i) {
350
+ if (cells[i].has_seq_id(seq_id)) {
351
+ result = std::max(result, cells[i].pos);
352
+ }
353
+ }
354
+
355
+ return result;
356
+ }
357
+
358
+ llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
359
+ GGML_UNUSED(embd_pooled);
360
+
361
+ auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
362
+
363
+ std::vector<llama_ubatch> ubatches;
364
+
365
+ while (sbatch.n_tokens > 0) {
366
+ llama_ubatch ubatch;
367
+
368
+ if (embd_pooled) {
369
+ // Pooled embeddings cannot be split across ubatches (yet)
370
+ ubatch = sbatch.split_seq(n_ubatch);
371
+ } else {
372
+ ubatch = sbatch.split_equal(n_ubatch);
373
+ }
374
+
375
+ ubatches.push_back(ubatch);
376
+ }
377
+
378
+ if (!prepare(ubatches)) {
379
+ return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
380
+ }
381
+
382
+ return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this, std::move(sbatch), std::move(ubatches));
383
+ }
384
+
385
+ llama_memory_state_ptr llama_kv_cache_recurrent::init_full() {
386
+ return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
387
+ }
388
+
389
+ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
390
+ // simply remember the full state because it is very small for this type of cache
391
+ // TODO: optimize
392
+ auto org_cells = cells;
393
+ auto org_used = used;
394
+ auto org_head = head;
395
+
396
+ bool success = true;
397
+
398
+ // TODO: here we have to verify that all ubatches can fit in the cells
399
+ // however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
400
+ // during the compute of each ubatch. to reproduce, uncomment the following loop and run:
401
+ //
402
+ // $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
403
+ //
404
+ // recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
405
+ //
406
+ GGML_UNUSED(ubatches);
407
+ //for (const auto & ubatch : ubatches) {
408
+ // if (!find_slot(ubatch)) {
409
+ // success = false;
410
+ // break;
411
+ // }
412
+ //}
413
+
414
+ // restore the original state
415
+ cells = std::move(org_cells);
416
+ used = org_used;
417
+ head = org_head;
418
+
419
+ return success;
420
+ }
421
+
422
+ bool llama_kv_cache_recurrent::update(llama_context & lctx) {
423
+ GGML_UNUSED(lctx);
424
+ // noop
425
+ return false;
426
+ }
427
+
428
+ void llama_kv_cache_recurrent::defrag_sched(float thold) {
429
+ GGML_UNUSED(thold);
430
+ // noop
431
+ }
432
+
433
+ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
434
+ const uint32_t n_tokens = ubatch.n_tokens;
435
+ const uint32_t n_seqs = ubatch.n_seqs;
436
+
437
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
438
+
439
+ // if we have enough unused cells before the current head ->
440
+ // better to start searching from the beginning of the cache, hoping to fill it
441
+ if (head > used + 2*n_tokens) {
442
+ head = 0;
443
+ }
444
+
445
+ // For recurrent state architectures (like Mamba or RWKV),
446
+ // each cache cell can store the state for a whole sequence.
447
+ // A slot should be always be contiguous.
448
+
449
+ // can only process batches with an equal number of new tokens in each sequence
450
+ GGML_ASSERT(ubatch.equal_seqs);
451
+
452
+ int32_t min = size - 1;
453
+ int32_t max = 0;
454
+
455
+ // everything should fit if all seq_ids are smaller than the max
456
+ for (uint32_t s = 0; s < n_seqs; ++s) {
457
+ const uint32_t n_seq_id = ubatch.n_seq_id[s];
458
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
459
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
460
+
461
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
462
+ // too big seq_id
463
+ // TODO: would it be possible to resize the cache instead?
464
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
465
+ return false;
466
+ }
467
+ if (j > 0) {
468
+ kv_cell & seq = cells[seq_id];
469
+ if (seq.tail >= 0) {
470
+ kv_cell & cell = cells[seq.tail];
471
+ // clear cells from seq_ids that become shared
472
+ // (should not normally happen, but let's handle it anyway)
473
+ cell.seq_id.erase(seq_id);
474
+ seq.tail = -1;
475
+ if (cell.seq_id.empty()) {
476
+ cell.pos = -1;
477
+ cell.src = -1;
478
+ used -= 1;
479
+ }
480
+ }
481
+ }
482
+ }
483
+ }
484
+
485
+ #ifndef NDEBUG
486
+ {
487
+ std::vector<int32_t> tails_verif;
488
+ tails_verif.assign(size, -1);
489
+ for (uint32_t i = 0; i < size; ++i) {
490
+ kv_cell & cell = cells[i];
491
+ for (llama_seq_id seq_id : cell.seq_id) {
492
+ if (tails_verif[seq_id] != -1) {
493
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
494
+ }
495
+ tails_verif[seq_id] = i;
496
+ }
497
+ }
498
+ for (uint32_t i = 0; i < size; ++i) {
499
+ if (tails_verif[i] != cells[i].tail) {
500
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
501
+ }
502
+ }
503
+ }
504
+ #endif
505
+
506
+ // find next empty cell
507
+ uint32_t next_empty_cell = head;
508
+
509
+ for (uint32_t i = 0; i < size; ++i) {
510
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
511
+ kv_cell & cell = cells[next_empty_cell];
512
+ if (cell.is_empty()) { break; }
513
+ next_empty_cell += 1;
514
+ }
515
+
516
+ // find usable cell range
517
+ for (uint32_t s = 0; s < n_seqs; ++s) {
518
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
519
+ kv_cell & seq_meta = cells[seq_id];
520
+ bool has_cell = false;
521
+ if (seq_meta.tail >= 0) {
522
+ kv_cell & cell = cells[seq_meta.tail];
523
+ GGML_ASSERT(cell.has_seq_id(seq_id));
524
+ // does this seq_id "own" the cell?
525
+ if (cell.seq_id.size() == 1) { has_cell = true; }
526
+ }
527
+ if (!has_cell) {
528
+ kv_cell & empty_cell = cells[next_empty_cell];
529
+ GGML_ASSERT(empty_cell.is_empty());
530
+ // copy old tail into the empty cell
531
+ if (seq_meta.tail >= 0) {
532
+ kv_cell & orig_cell = cells[seq_meta.tail];
533
+ empty_cell.pos = orig_cell.pos;
534
+ empty_cell.src = orig_cell.src;
535
+ orig_cell.seq_id.erase(seq_id);
536
+ empty_cell.seq_id.insert(seq_id); // will be overwritten
537
+ }
538
+ seq_meta.tail = next_empty_cell;
539
+ // find next empty cell
540
+ if (s + 1 < n_seqs) {
541
+ next_empty_cell += 1;
542
+ for (uint32_t i = 0; i < size; ++i) {
543
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
544
+ kv_cell & cell = cells[next_empty_cell];
545
+ if (cell.is_empty()) { break; }
546
+ next_empty_cell += 1;
547
+ }
548
+ }
549
+ }
550
+ if (min > seq_meta.tail) { min = seq_meta.tail; }
551
+ if (max < seq_meta.tail) { max = seq_meta.tail; }
552
+ }
553
+
554
+ // gather and re-order
555
+ for (uint32_t s = 0; s < n_seqs; ++s) {
556
+ int32_t dst_id = s + min;
557
+ int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
558
+ if (dst_id != src_id) {
559
+ kv_cell & dst_cell = cells[dst_id];
560
+ kv_cell & src_cell = cells[src_id];
561
+
562
+ std::swap(dst_cell.pos, src_cell.pos);
563
+ std::swap(dst_cell.src, src_cell.src);
564
+ std::swap(dst_cell.seq_id, src_cell.seq_id);
565
+
566
+ // swap tails (assuming they NEVER overlap)
567
+ for (const llama_seq_id seq_id : src_cell.seq_id) {
568
+ cells[seq_id].tail = src_id;
569
+ }
570
+ for (const llama_seq_id seq_id : dst_cell.seq_id) {
571
+ cells[seq_id].tail = dst_id;
572
+ }
573
+ }
574
+ }
575
+
576
+ // update the pos of the used seqs
577
+ for (uint32_t s = 0; s < n_seqs; ++s) {
578
+ const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
579
+ int32_t cell_id = s + min;
580
+ kv_cell & cell = cells[cell_id];
581
+
582
+ if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
583
+ // What should happen when the pos backtracks or skips a value?
584
+ // Clearing the state mid-batch would require special-casing which isn't done.
585
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
586
+ __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
587
+ }
588
+ cell.pos = last_pos;
589
+ cell.seq_id.clear();
590
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
591
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
592
+ cell.seq_id.insert(seq_id);
593
+ cells[seq_id].tail = cell_id;
594
+ }
595
+ }
596
+
597
+ // allow getting the range of used cells, from head to head + n
598
+ head = min;
599
+ n = max - min + 1;
600
+ used = std::count_if(cells.begin(), cells.end(),
601
+ [](const kv_cell & cell){ return !cell.is_empty(); });
602
+
603
+ // sanity check
604
+ return n >= n_seqs;
605
+ }
606
+
607
+ bool llama_kv_cache_recurrent::get_can_shift() const {
608
+ return false;
609
+ }
610
+
611
+ int32_t llama_kv_cache_recurrent::s_copy(int i) const {
612
+ const uint32_t cell_id = i + head;
613
+
614
+ //////////////////////////////////////////////
615
+ // TODO: this should not mutate the KV cache !
616
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
617
+
618
+ // prevent out-of-bound sources
619
+ if (cell.src < 0 || (uint32_t) cell.src >= size) {
620
+ cell.src = cell_id;
621
+ }
622
+
623
+ int32_t res = cell.src;
624
+
625
+ // TODO: do not mutate the KV cache
626
+ // ensure copy only happens once
627
+ if (cell.src != (int32_t) cell_id) {
628
+ cell.src = cell_id;
629
+ }
630
+
631
+ return res;
632
+ }
633
+
634
+ float llama_kv_cache_recurrent::s_mask(int i) const {
635
+ const uint32_t cell_id = i + head;
636
+
637
+ //////////////////////////////////////////////
638
+ // TODO: this should not mutate the KV cache !
639
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
640
+
641
+ float res = (float) (cell.src >= 0);
642
+
643
+ // only clear once
644
+ if (cell.src < 0) {
645
+ cell.src = cell_id;
646
+ }
647
+
648
+ return res;
649
+ }
650
+
651
+ size_t llama_kv_cache_recurrent::total_size() const {
652
+ size_t size = 0;
653
+ for (const auto & buf : bufs) {
654
+ size += ggml_backend_buffer_get_size(buf.get());
655
+ }
656
+
657
+ return size;
658
+ }
659
+
660
+ size_t llama_kv_cache_recurrent::size_k_bytes() const {
661
+ size_t size_k_bytes = 0;
662
+
663
+ for (const auto & k : k_l) {
664
+ size_k_bytes += ggml_nbytes(k);
665
+ }
666
+
667
+ return size_k_bytes;
668
+ }
669
+
670
+ size_t llama_kv_cache_recurrent::size_v_bytes() const {
671
+ size_t size_v_bytes = 0;
672
+
673
+ for (const auto & v : v_l) {
674
+ size_v_bytes += ggml_nbytes(v);
675
+ }
676
+
677
+ return size_v_bytes;
678
+ }
679
+
680
+ void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
681
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
682
+ uint32_t cell_count = 0;
683
+
684
+ // Count the number of cells with the specified seq_id
685
+ // Find all the ranges of cells with this seq id (or all, when -1)
686
+ uint32_t cell_range_begin = size;
687
+ for (uint32_t i = 0; i < size; ++i) {
688
+ const auto & cell = cells[i];
689
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
690
+ ++cell_count;
691
+ if (cell_range_begin == size) {
692
+ cell_range_begin = i;
693
+ }
694
+ } else {
695
+ if (cell_range_begin != size) {
696
+ cell_ranges.emplace_back(cell_range_begin, i);
697
+ cell_range_begin = size;
698
+ }
699
+ }
700
+ }
701
+ if (cell_range_begin != size) {
702
+ cell_ranges.emplace_back(cell_range_begin, size);
703
+ }
704
+
705
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
706
+ uint32_t cell_count_check = 0;
707
+ for (const auto & range : cell_ranges) {
708
+ cell_count_check += range.second - range.first;
709
+ }
710
+ GGML_ASSERT(cell_count == cell_count_check);
711
+
712
+ io.write(&cell_count, sizeof(cell_count));
713
+
714
+ state_write_meta(io, cell_ranges, seq_id);
715
+ state_write_data(io, cell_ranges);
716
+ }
717
+
718
+ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
719
+ uint32_t cell_count;
720
+ io.read_to(&cell_count, sizeof(cell_count));
721
+
722
+ bool res = true;
723
+
724
+ res = res && state_read_meta(io, cell_count, seq_id);
725
+ res = res && state_read_data(io, cell_count);
726
+
727
+ if (!res) {
728
+ if (seq_id == -1) {
729
+ clear();
730
+ } else {
731
+ seq_rm(seq_id, -1, -1);
732
+ }
733
+ throw std::runtime_error("failed to restore kv cache");
734
+ }
735
+ }
736
+
737
+ void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
738
+ for (const auto & range : cell_ranges) {
739
+ for (uint32_t i = range.first; i < range.second; ++i) {
740
+ const auto & cell = cells[i];
741
+ const llama_pos pos = cell.pos;
742
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
743
+
744
+ io.write(&pos, sizeof(pos));
745
+ io.write(&n_seq_id, sizeof(n_seq_id));
746
+
747
+ if (n_seq_id) {
748
+ for (auto seq_id : cell.seq_id) {
749
+ io.write(&seq_id, sizeof(seq_id));
750
+ }
751
+ }
752
+ }
753
+ }
754
+ }
755
+
756
+ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
757
+ const uint32_t v_trans = 0;
758
+ const uint32_t n_layer = hparams.n_layer;
759
+
760
+ io.write(&v_trans, sizeof(v_trans));
761
+ io.write(&n_layer, sizeof(n_layer));
762
+
763
+ std::vector<uint8_t> tmp_buf;
764
+
765
+ // Iterate and write all the keys first, each row is a cell
766
+ // Get whole range at a time
767
+ for (uint32_t il = 0; il < n_layer; ++il) {
768
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
769
+
770
+ // Write key type
771
+ const int32_t k_type_i = (int32_t)k_l[il]->type;
772
+ io.write(&k_type_i, sizeof(k_type_i));
773
+
774
+ // Write row size of key
775
+ const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
776
+ io.write(&k_size_row, sizeof(k_size_row));
777
+
778
+ // Read each range of cells of k_size length each into tmp_buf and write out
779
+ for (const auto & range : cell_ranges) {
780
+ const size_t range_size = range.second - range.first;
781
+ const size_t buf_size = range_size * k_size_row;
782
+ io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
783
+ }
784
+ }
785
+
786
+ if (!v_trans) {
787
+ for (uint32_t il = 0; il < n_layer; ++il) {
788
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
789
+
790
+ // Write value type
791
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
792
+ io.write(&v_type_i, sizeof(v_type_i));
793
+
794
+ // Write row size of value
795
+ const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
796
+ io.write(&v_size_row, sizeof(v_size_row));
797
+
798
+ // Read each range of cells of v_size length each into tmp_buf and write out
799
+ for (const auto & range : cell_ranges) {
800
+ const size_t range_size = range.second - range.first;
801
+ const size_t buf_size = range_size * v_size_row;
802
+ io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
803
+ }
804
+ }
805
+ } else {
806
+ // When v is transposed, we also need the element size and get the element ranges from each row
807
+ const uint32_t kv_size = size;
808
+ for (uint32_t il = 0; il < n_layer; ++il) {
809
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
810
+
811
+ // Write value type
812
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
813
+ io.write(&v_type_i, sizeof(v_type_i));
814
+
815
+ // Write element size
816
+ const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
817
+ io.write(&v_size_el, sizeof(v_size_el));
818
+
819
+ // Write GQA embedding size
820
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
821
+
822
+ // For each row, we get the element values of each cell
823
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
824
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
825
+ for (const auto & range : cell_ranges) {
826
+ const size_t range_size = range.second - range.first;
827
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
828
+ const size_t buf_size = range_size * v_size_el;
829
+ io.write_tensor(v_l[il], src_offset, buf_size);
830
+ }
831
+ }
832
+ }
833
+ }
834
+ }
835
+
836
+ bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
837
+ if (dest_seq_id != -1) {
838
+ // single sequence
839
+
840
+ seq_rm(dest_seq_id, -1, -1);
841
+
842
+ llama_sbatch sbatch;
843
+ llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
844
+
845
+ batch.n_tokens = cell_count;
846
+ batch.n_seq_tokens = cell_count;
847
+ batch.n_seqs = 1;
848
+
849
+ for (uint32_t i = 0; i < cell_count; ++i) {
850
+ llama_pos pos;
851
+ uint32_t n_seq_id;
852
+
853
+ io.read_to(&pos, sizeof(pos));
854
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
855
+
856
+ if (n_seq_id != 0) {
857
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
858
+ return false;
859
+ }
860
+
861
+ batch.pos[i] = pos;
862
+ }
863
+ batch.n_seq_id[0] = 1;
864
+ batch.seq_id[0] = &dest_seq_id;
865
+
866
+ if (!find_slot(batch)) {
867
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
868
+ return false;
869
+ }
870
+
871
+ // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
872
+ // Assume that this is one contiguous block of cells
873
+ GGML_ASSERT(head + cell_count <= size);
874
+ GGML_ASSERT(cells[head].pos == batch.pos[0]);
875
+ GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
876
+ GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
877
+ GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
878
+ } else {
879
+ // whole KV cache restore
880
+
881
+ if (cell_count > size) {
882
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
883
+ return false;
884
+ }
885
+
886
+ clear();
887
+
888
+ for (uint32_t i = 0; i < cell_count; ++i) {
889
+ kv_cell & cell = cells[i];
890
+
891
+ llama_pos pos;
892
+ uint32_t n_seq_id;
893
+
894
+ io.read_to(&pos, sizeof(pos));
895
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
896
+
897
+ cell.pos = pos;
898
+
899
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
900
+ llama_seq_id seq_id;
901
+ io.read_to(&seq_id, sizeof(seq_id));
902
+
903
+ // TODO: llama_kv_cache_recurrent should have a notion of max sequences
904
+ //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
905
+ if (seq_id < 0) {
906
+ //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
907
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
908
+ return false;
909
+ }
910
+
911
+ cell.seq_id.insert(seq_id);
912
+
913
+ int32_t & tail = cells[seq_id].tail;
914
+ if (tail != -1) {
915
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
916
+ return false;
917
+ }
918
+ tail = i;
919
+ }
920
+ }
921
+
922
+ head = 0;
923
+ used = cell_count;
924
+ }
925
+
926
+ for (uint32_t i = 0; i < cell_count; ++i) {
927
+ uint32_t cell_id = head + i;
928
+ // make sure the recurrent states will keep their restored state
929
+ cells[cell_id].src = cell_id;
930
+ }
931
+
932
+ return true;
933
+ }
934
+
935
+ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
936
+ uint32_t v_trans;
937
+ uint32_t n_layer;
938
+ io.read_to(&v_trans, sizeof(v_trans));
939
+ io.read_to(&n_layer, sizeof(n_layer));
940
+
941
+ if (n_layer != hparams.n_layer) {
942
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
943
+ return false;
944
+ }
945
+ if (cell_count > size) {
946
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
947
+ return false;
948
+ }
949
+ if (false != (bool) v_trans) {
950
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
951
+ return false;
952
+ }
953
+
954
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
955
+ for (uint32_t il = 0; il < n_layer; ++il) {
956
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
957
+
958
+ // Read type of key
959
+ int32_t k_type_i_ref;
960
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
961
+ const int32_t k_type_i = (int32_t) k_l[il]->type;
962
+ if (k_type_i != k_type_i_ref) {
963
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
964
+ return false;
965
+ }
966
+
967
+ // Read row size of key
968
+ uint64_t k_size_row_ref;
969
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
970
+ const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
971
+ if (k_size_row != k_size_row_ref) {
972
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
973
+ return false;
974
+ }
975
+
976
+ if (cell_count) {
977
+ // Read and set the keys for the whole cell range
978
+ ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
979
+ }
980
+ }
981
+
982
+ if (!v_trans) {
983
+ for (uint32_t il = 0; il < n_layer; ++il) {
984
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
985
+
986
+ // Read type of value
987
+ int32_t v_type_i_ref;
988
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
989
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
990
+ if (v_type_i != v_type_i_ref) {
991
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
992
+ return false;
993
+ }
994
+
995
+ // Read row size of value
996
+ uint64_t v_size_row_ref;
997
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
998
+ const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
999
+ if (v_size_row != v_size_row_ref) {
1000
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1001
+ return false;
1002
+ }
1003
+
1004
+ if (cell_count) {
1005
+ // Read and set the values for the whole cell range
1006
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1007
+ }
1008
+ }
1009
+ } else {
1010
+ // For each layer, read the values for each cell (transposed)
1011
+ for (uint32_t il = 0; il < n_layer; ++il) {
1012
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1013
+
1014
+ // Read type of value
1015
+ int32_t v_type_i_ref;
1016
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1017
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1018
+ if (v_type_i != v_type_i_ref) {
1019
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1020
+ return false;
1021
+ }
1022
+
1023
+ // Read element size of value
1024
+ uint32_t v_size_el_ref;
1025
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1026
+ const size_t v_size_el = ggml_type_size(v_l[il]->type);
1027
+ if (v_size_el != v_size_el_ref) {
1028
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1029
+ return false;
1030
+ }
1031
+
1032
+ // Read GQA embedding size
1033
+ uint32_t n_embd_v_gqa_ref;
1034
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1035
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1036
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1037
+ return false;
1038
+ }
1039
+
1040
+ if (cell_count) {
1041
+ // For each row in the transposed matrix, read the values for the whole cell range
1042
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1043
+ const size_t dst_offset = (head + j * size) * v_size_el;
1044
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1045
+ }
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ return true;
1051
+ }
1052
+
1053
+ //
1054
+ // llama_kv_cache_recurrent_state
1055
+ //
1056
+
1057
+ llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state(llama_memory_status status) : status(status) {}
1058
+
1059
+ llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state(
1060
+ llama_memory_status status,
1061
+ llama_kv_cache_recurrent * kv) : status(status), kv(kv), is_full(true) {
1062
+ }
1063
+
1064
+ llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state(
1065
+ llama_memory_status status,
1066
+ llama_kv_cache_recurrent * kv,
1067
+ llama_sbatch sbatch,
1068
+ std::vector<llama_ubatch> ubatches) : status(status), kv(kv), sbatch(std::move(sbatch)), ubatches(std::move(ubatches)) {}
1069
+
1070
+ llama_kv_cache_recurrent_state::~llama_kv_cache_recurrent_state() = default;
1071
+
1072
+ bool llama_kv_cache_recurrent_state::next() {
1073
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1074
+
1075
+ if (++i_next >= ubatches.size()) {
1076
+ return false;
1077
+ }
1078
+
1079
+ return true;
1080
+ }
1081
+
1082
+ bool llama_kv_cache_recurrent_state::apply() {
1083
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1084
+
1085
+ kv->find_slot(ubatches[i_next]);
1086
+
1087
+ return true;
1088
+ }
1089
+
1090
+ std::vector<int64_t> & llama_kv_cache_recurrent_state::out_ids() {
1091
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1092
+
1093
+ return sbatch.out_ids;
1094
+ }
1095
+
1096
+ llama_memory_status llama_kv_cache_recurrent_state::get_status() const {
1097
+ return status;
1098
+ }
1099
+
1100
+ const llama_ubatch & llama_kv_cache_recurrent_state::get_ubatch() const {
1101
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1102
+
1103
+ return ubatches[i_next];
1104
+ }
1105
+
1106
+ uint32_t llama_kv_cache_recurrent_state::get_n_kv() const {
1107
+ return is_full ? kv->size : kv->n;
1108
+ }
1109
+
1110
+ uint32_t llama_kv_cache_recurrent_state::get_head() const {
1111
+ return is_full ? 0 : kv->head;
1112
+ }
1113
+
1114
+ uint32_t llama_kv_cache_recurrent_state::get_size() const {
1115
+ return kv->size;
1116
+ }
1117
+
1118
+ ggml_tensor * llama_kv_cache_recurrent_state::get_k_l(int32_t il) const {
1119
+ return kv->k_l[il];
1120
+ }
1121
+
1122
+ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
1123
+ return kv->v_l[il];
1124
+ }
1125
+
1126
+ int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
1127
+ return kv->s_copy(i);
1128
+ }
1129
+
1130
+ float llama_kv_cache_recurrent_state::s_mask(int i) const {
1131
+ return kv->s_mask(i);
1132
+ }