cui-llama.rn 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
package/cpp/llama-kv-cache.cpp
@@ -74,14 +74,14 @@ bool llama_kv_cache_init(
     for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
         LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
 
         lm_ggml_backend_buffer_type_t buft;
         if (offload) {
-            auto * dev = model.dev_layer.at(i).dev;
+            auto * dev = model.dev_layer(i);
             buft = lm_ggml_backend_dev_buffer_type(dev);
         } else {
             buft = lm_ggml_backend_cpu_buffer_type();
         }
         lm_ggml_context * ctx = ctx_for_buft(buft);