@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
@@ -23,32 +23,21 @@ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
  }

  llama_kv_cache_unified::llama_kv_cache_unified(
- const llama_model & model,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- uint32_t kv_size,
- uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
- const int32_t n_layer = hparams.n_layer;
-
- has_shift = false;
- can_shift = true;
-
- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
-
- GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
-
- head = 0;
- size = kv_size;
- used = 0;
-
- this->type_k = type_k;
- this->type_v = type_v;
-
- cells.clear();
- cells.resize(kv_size);
+ const llama_model & model,
+ layer_filter_cb && filter,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type) :
+ model(model), hparams(model.hparams), v_trans(v_trans),
+ n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+
+ GGML_ASSERT(kv_size % n_pad == 0);

  // create a context for each buffer type
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -56,7 +45,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+ /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
  };
@@ -75,37 +64,50 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  return it->second;
  };

- k_l.reserve(n_layer);
- v_l.reserve(n_layer);
+ head = 0;
+ size = kv_size;
+ used = 0;

- for (int i = 0; i < n_layer; i++) {
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+ cells.resize(kv_size);
+
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ continue;
+ }
+
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

  const char * dev_name = "CPU";

  ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

  if (offload) {
- auto * dev = model.dev_layer(i);
+ auto * dev = model.dev_layer(il);
  buft = ggml_backend_dev_buffer_type(dev);

  dev_name = ggml_backend_dev_name(dev);
  }

- LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);

  ggml_context * ctx = ctx_for_buft(buft);
  if (!ctx) {
  throw std::runtime_error("failed to create ggml context for kv cache");
  }

- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
- ggml_format_name(k, "cache_k_l%d", i);
- ggml_format_name(v, "cache_v_l%d", i);
- k_l.push_back(k);
- v_l.push_back(v);
+ ggml_tensor * k;
+ ggml_tensor * v;
+
+ k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
+ v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
+
+ ggml_format_name(k, "cache_k_l%d", il);
+ ggml_format_name(v, "cache_v_l%d", il);
+
+ map_layer_ids[il] = layers.size();
+ layers.push_back({ il, k, v });
  }

  // allocate tensors and initialize the buffers to avoid NaNs in the padding
@@ -117,8 +119,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  if (!buf) {
  throw std::runtime_error("failed to allocate buffer for kv cache");
  }
- ggml_backend_buffer_clear(buf, 0);
+
  LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+ ggml_backend_buffer_clear(buf, 0);
  bufs.emplace_back(buf);
  }

@@ -126,18 +130,19 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  const size_t memory_size_k = size_k_bytes();
  const size_t memory_size_v = size_v_bytes();

- LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
  ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  }
  }

  void llama_kv_cache_unified::clear() {
- for (int32_t i = 0; i < (int32_t) size; ++i) {
+ for (uint32_t i = 0; i < size; ++i) {
  cells[i].pos = -1;
  cells[i].seq_id.clear();
  }
+
  head = 0;
  used = 0;

@@ -166,6 +171,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
  } else {
  continue;
  }
+
  if (cells[i].is_empty()) {
  // keep count of the number of used cells
  if (cells[i].pos >= 0) {
@@ -262,6 +268,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
  for (uint32_t i = 0; i < size; ++i) {
  if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
  has_shift = true;
+
  cells[i].pos += delta;
  cells[i].delta += delta;

@@ -314,53 +321,60 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
  }
  }

- llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
- llama_pos result = 0;
+ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+ llama_pos result = std::numeric_limits<llama_pos>::max();

  for (uint32_t i = 0; i < size; ++i) {
  if (cells[i].has_seq_id(seq_id)) {
- result = std::max(result, cells[i].pos);
+ result = std::min(result, cells[i].pos);
  }
  }

+ if (result == std::numeric_limits<llama_pos>::max()) {
+ result = -1;
+ }
+
  return result;
  }

- void llama_kv_cache_unified::restore() {
- if (pending.ranges.empty()) {
- return;
- }
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+ llama_pos result = -1;

- uint32_t new_head = size;
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id)) {
+ result = std::max(result, cells[i].pos);
+ }
+ }

- for (auto & range : pending.ranges) {
- for (uint32_t i = range.c0; i < range.c1; ++i) {
- cells[i].seq_id.clear();
+ return result;
+ }

- // keep count of the number of used cells
- if (cells[i].pos >= 0) {
- used--;
- }
+ void llama_kv_cache_unified::restore() {
+ for (const auto & [id, cell] : recovery.cells) {
+ // TODO: move to new `struct kv_cells`
+ const bool is_empty0 = cells[id].is_empty();
+ const bool is_empty1 = cell.is_empty();

- cells[i].pos = -1;
+ if (!is_empty0 && is_empty1) {
+ used--;
+ } else if (is_empty0 && !is_empty1) {
+ used++;
  }

- new_head = std::min(new_head, range.c0);
+ cells[id] = cell;
  }

- if (new_head != size && new_head < head) {
- head = new_head;
- }
+ recovery.clear();
  }

  void llama_kv_cache_unified::commit() {
- if (pending.ranges.empty()) {
- LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
- __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
+ if (recovery.cells.empty()) {
+ LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
  return;
  }

- pending.ranges.clear();
+ recovery.clear();
  }

  bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -429,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
  void llama_kv_cache_unified::defrag_sched(float thold) {
  // - do not defrag small contexts (i.e. < 2048 tokens)
  // - count the padding towards the number of used tokens
- const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
+ const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;

  // queue defragmentation for next llama_kv_cache_update
  if (fragmentation > thold) {
@@ -450,25 +464,17 @@ void llama_kv_cache_unified::set_full() {
  head = 0;
  }

- llama_sbatch llama_kv_cache_unified::sbatch_init(
- const llama_batch & batch,
- bool logits_all) {
+ llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
  return llama_sbatch(batch, hparams.n_embd, true, logits_all);
  }

- llama_ubatch llama_kv_cache_unified::ubatch_next(
- llama_sbatch & sbatch,
- uint32_t n_ubatch,
- bool embd_pooled) const {
+ llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
  GGML_UNUSED(embd_pooled);
  return sbatch.split_simple(n_ubatch);
  }

- bool llama_kv_cache_unified::find_slot(
- const llama_ubatch & ubatch) {
+ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
  const uint32_t n_tokens = ubatch.n_tokens;
- const uint32_t n_seqs = ubatch.n_seqs;
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

  // if we have enough unused cells before the current head ->
  // better to start searching from the beginning of the cache, hoping to fill it
@@ -483,6 +489,29 @@ bool llama_kv_cache_unified::find_slot(
  return false;
  }

+ //#define FIND_SLOT_DEBUG 1
+ #if FIND_SLOT_DEBUG
+ LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+
+ // for debugging
+ {
+ std::string ss;
+ if (n_swa > 0) {
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].pos == -1) {
+ ss += '.';
+ } else {
+ ss += std::to_string(*cells[i].seq_id.begin());
+ }
+ if (i%256 == 255) {
+ ss += '\n';
+ }
+ }
+ }
+ LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+ }
+ #endif
+
  uint32_t n_tested = 0;

  while (true) {
@@ -512,60 +541,257 @@ bool llama_kv_cache_unified::find_slot(
  }
  }

- for (uint32_t s = 0; s < n_seqs; s++) {
- for (uint32_t i = 0; i < n_seq_tokens; ++i) {
- uint32_t k = s*n_seq_tokens + i;
- cells[head + k].pos = ubatch.pos[k];
+ for (uint32_t i = 0; i < n_tokens; ++i) {
+ // remember the original state
+ if (recovery.cells.find(head + i) == recovery.cells.end()) {
+ recovery.cells[head + i] = cells[head + i];
+ }

- for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
- cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
- }
+ cells[head + i].pos = ubatch.pos[i];
+
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
+ cells[head + i].seq_id.insert(ubatch.seq_id[i][j]);
  }
  }

  used += n_tokens;

- pending.ranges.push_back({head, head + n_tokens});
-
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
- n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
+ n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));

- //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);
+ #ifdef FIND_SLOT_DEBUG
+ LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
+ #endif

  return true;
  }

- int32_t llama_kv_cache_unified::get_n_tokens() const {
- int32_t result = 0;
+ bool llama_kv_cache_unified::get_can_shift() const {
+ return true;
+ }

- for (uint32_t i = 0; i < size; i++) {
- result += cells[i].seq_id.size();
+ uint32_t llama_kv_cache_unified::get_n() const {
+ return n;
+ }
+
+ uint32_t llama_kv_cache_unified::get_size() const {
+ return size;
+ }
+
+ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * k = layers[ikv].k;
+
+ return ggml_view_3d(ctx, k,
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n,
+ ggml_row_size(k->type, hparams.n_embd_head_k),
+ ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
+ 0);
+ }
+
+ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ if (!v_trans) {
+ // note: v->nb[1] <= v->nb[2]
+ return ggml_view_3d(ctx, v,
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n,
+ ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
+ 0);
  }

- return result;
+ // note: v->nb[1] > v->nb[2]
+ return ggml_view_3d(ctx, v,
+ n, hparams.n_head_kv(il), hparams.n_embd_head_v,
+ ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
+ ggml_row_size(v->type, v->ne[1]), // v->nb[2]
+ 0);
  }

- int32_t llama_kv_cache_unified::get_used_cells() const {
- return used;
+ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * k = layers[ikv].k;
+
+ const int64_t n_tokens = k_cur->ne[2];
+
+ ggml_tensor * k_view = ggml_view_1d(ctx, k,
+ n_tokens*hparams.n_embd_k_gqa(il),
+ ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head);
+
+ return ggml_cpy(ctx, k_cur, k_view);
  }

- bool llama_kv_cache_unified::get_can_shift() const {
- return can_shift;
+ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ const int64_t n_tokens = v_cur->ne[2];
+
+ v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
+
+ ggml_tensor * v_view = nullptr;
+
+ if (!v_trans) {
+ v_view = ggml_view_1d(ctx, v,
+ n_tokens*hparams.n_embd_v_gqa(il),
+ ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head);
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
+ (v->ne[1])*ggml_element_size(v),
+ ( head)*ggml_element_size(v));
+
+ v_cur = ggml_transpose(ctx, v_cur);
+ }
+
+ return ggml_cpy(ctx, v_cur, v_view);
+ }
+
+ void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
+ // no pruning is needed when the cache does not use SWA
+ GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
+
+ int n_attended = 0;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ const llama_pos p0 = cells[i].pos;
+
+ if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
+ n_attended++;
+ }
+
+ if (is_masked_swa(p0, pmax)) {
+ if (seq_id < 0) {
+ cells[i].seq_id.clear();
+ } else if (cells[i].has_seq_id(seq_id)) {
+ cells[i].seq_id.erase(seq_id);
+ } else {
+ continue;
+ }
+
+ if (cells[i].is_empty()) {
+ // keep count of the number of used cells
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ }
+ }
+ }
+
+ if (n_attended < std::min<int>(n_swa, pmin)) {
+ LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
+ }
+ }
+
+ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ const int64_t n_tokens = ubatch->n_tokens;
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+ const int64_t n_seqs = ubatch->n_seqs;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ float * data = (float *) dst->data;
+
+ const int64_t n_kv = n;
+
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
+ // Causal mask:
+ // xxx-------
+ // xxxx------
+ // xxxxx-----
+ // Non-causal mask:
+ // xxxxx-----
+ // xxxxx-----
+ // xxxxx-----
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
+ for (int h = 0; h < 1; ++h) {
+ for (int s = 0; s < n_seqs; ++s) {
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+ for (int j = 0; j < n_seq_tokens; ++j) {
+ const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
+
+ for (int i = 0; i < n_kv; ++i) {
+ const llama_pos p0 = cells[i].pos;
+
+ bool masked = false;
+
+ // mask the token if not the same sequence
+ masked = masked || (!cells[i].has_seq_id(seq_id));
+
+ // mask future tokens
+ masked = masked || (causal_attn && p0 > p1);
+
+ // apply SWA if any
+ masked = masked || (is_masked_swa(p0, p1));
+
+ float f = 0.0f;
+
+ if (masked) {
+ f = -INFINITY;
+ } else if (hparams.use_alibi) {
+ f = -std::abs(p0 - p1);
+ }
+
+ data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
+ }
+ }
+ }
+
+ // mask padded tokens
+ if (data) {
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
+ }
+ }
  }

- llama_pos llama_kv_cache_unified::get_pos_max() const {
- llama_pos pos_max = -1;
- for (const auto & cell : cells) {
- pos_max = std::max(pos_max, cell.pos);
+ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+
+ int32_t * data = (int32_t *) dst->data;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ data[i] = cells[i].delta;
  }
+ }
+
+ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ const int64_t n_tokens = ubatch->n_tokens;

- return pos_max;
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+ GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+
+ int32_t * data = (int32_t *) dst->data;
+
+ const int64_t n_kv = n;
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ for (int i = 0; i < n_kv; ++i) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
+ }
+ }
+ }
  }

  size_t llama_kv_cache_unified::total_size() const {
  size_t size = 0;
+
  for (const auto & buf : bufs) {
  size += ggml_backend_buffer_get_size(buf.get());
  }
@@ -576,8 +802,8 @@ size_t llama_kv_cache_unified::total_size() const {
  size_t llama_kv_cache_unified::size_k_bytes() const {
  size_t size_k_bytes = 0;

- for (const auto & k : k_l) {
- size_k_bytes += ggml_nbytes(k);
+ for (const auto & layer : layers) {
+ size_k_bytes += ggml_nbytes(layer.k);
  }

  return size_k_bytes;
@@ -586,8 +812,8 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
  size_t llama_kv_cache_unified::size_v_bytes() const {
  size_t size_v_bytes = 0;

- for (const auto & v : v_l) {
- size_v_bytes += ggml_nbytes(v);
+ for (const auto & layer : layers) {
+ size_v_bytes += ggml_nbytes(layer.v);
  }

  return size_v_bytes;
@@ -651,13 +877,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
  GGML_UNUSED(ubatch);

  if (k_shift) {
- assert(ggml_backend_buffer_is_host(k_shift->buffer));
-
- int32_t * data = (int32_t *) k_shift->data;
-
- for (uint32_t i = 0; i < kv_self->size; ++i) {
- data[i] = kv_self->cells[i].delta;
- }
+ kv_self->set_input_k_shift(k_shift);
  }
  }

@@ -667,13 +887,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
  ggml_cgraph * gf) const {
  auto res = std::make_unique<llm_graph_result>();

- const auto & n_layer = hparams.n_layer;
-
  const auto & n_embd_head_k = hparams.n_embd_head_k;
  //const auto & n_embd_head_v = hparams.n_embd_head_v;

- const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
-
  //GGML_ASSERT(kv_self->size == n_ctx);

  auto inp = std::make_unique<llm_graph_input_k_shift>(this);
@@ -681,24 +897,22 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
  inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
  ggml_set_input(inp->k_shift);

- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const int64_t n_head_kv = hparams.n_head_kv(il);
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);

- const bool is_swa = hparams.is_swa(il);
-
- // note: the swa rope params could become part of the cparams in the future
- // if we decide to make them configurable, like the non-sliding ones
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  ggml_tensor * k =
- ggml_view_3d(ctx, k_l[il],
+ ggml_view_3d(ctx, layer.k,
  n_embd_head_k, n_head_kv, size,
- ggml_row_size(k_l[il]->type, n_embd_head_k),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_head_k),
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
  0);

  ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
@@ -803,44 +1017,46 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
  nm++;
  }

- for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

- ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
+ ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
  n_embd_k_gqa, nm,
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_k_gqa*i));

- ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
  n_embd_k_gqa, nm,
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
+ ggml_row_size(layer.k->type, n_embd_k_gqa*id));

  ggml_tensor * view_v_src;
  ggml_tensor * view_v_dst;

  if (cparams.flash_attn) {
  // NOTE: the V cache is not transposed when using flash attention
- view_v_src = ggml_view_2d(ctx, v_l[il],
+ view_v_src = ggml_view_2d(ctx, layer.v,
  n_embd_v_gqa, nm,
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
+ ggml_row_size(layer.v->type, n_embd_v_gqa*i));

- view_v_dst = ggml_view_2d(ctx, v_l[il],
+ view_v_dst = ggml_view_2d(ctx, layer.v,
  n_embd_v_gqa, nm,
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
+ ggml_row_size(layer.v->type, n_embd_v_gqa*id));
  } else {
- view_v_src = ggml_view_2d(ctx, v_l[il],
+ view_v_src = ggml_view_2d(ctx, layer.v,
  nm, n_embd_v_gqa,
- ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, i));
+ ggml_row_size(layer.v->type, size),
+ ggml_row_size(layer.v->type, i));

- view_v_dst = ggml_view_2d(ctx, v_l[il],
+ view_v_dst = ggml_view_2d(ctx, layer.v,
  nm, n_embd_v_gqa,
- ggml_row_size(v_l[il]->type, size),
- ggml_row_size(v_l[il]->type, id));
+ ggml_row_size(layer.v->type, size),
+ ggml_row_size(layer.v->type, id));
  }

  ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
@@ -857,7 +1073,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
  }

  bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_layer = layers.size();

  const uint32_t n_kv = cell_max();
  const uint32_t n_used = used;
@@ -1005,6 +1221,34 @@ uint32_t llama_kv_cache_unified::cell_max() const {
  return 0;
  }

+ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+ if (p0 < 0) {
+ return true;
+ }
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE:
+ {
+ } break;
+ case LLAMA_SWA_TYPE_STANDARD:
+ {
+ if (p1 - p0 >= (int32_t) n_swa) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_CHUNKED:
+ {
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+ if (p0 < pos_chunk_start) {
+ return true;
+ }
+ } break;
+ }
+
+ return false;
+ }
+
  void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
  std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
  uint32_t cell_count = 0;
@@ -1082,7 +1326,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::

  void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
  const uint32_t v_trans = this->v_trans ? 1 : 0;
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_layer = layers.size();

  io.write(&v_trans, sizeof(v_trans));
  io.write(&n_layer, sizeof(n_layer));
@@ -1091,56 +1335,63 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::

  // Iterate and write all the keys first, each row is a cell
  // Get whole range at a time
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();

  // Write key type
- const int32_t k_type_i = (int32_t)k_l[il]->type;
+ const int32_t k_type_i = (int32_t)layer.k->type;
  io.write(&k_type_i, sizeof(k_type_i));

  // Write row size of key
- const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+ const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
  io.write(&k_size_row, sizeof(k_size_row));

  // Read each range of cells of k_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
  const size_t buf_size = range_size * k_size_row;
- io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
+ io.write_tensor(layer.k, range.first * k_size_row, buf_size);
  }
  }

  if (!v_trans) {
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

  // Write value type
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
  io.write(&v_type_i, sizeof(v_type_i));

  // Write row size of value
- const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+ const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
  io.write(&v_size_row, sizeof(v_size_row));

  // Read each range of cells of v_size length each into tmp_buf and write out
  for (const auto & range : cell_ranges) {
  const size_t range_size = range.second - range.first;
  const size_t buf_size = range_size * v_size_row;
- io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+ io.write_tensor(layer.v, range.first * v_size_row, buf_size);
  }
  }
  } else {
  // When v is transposed, we also need the element size and get the element ranges from each row
  const uint32_t kv_size = size;
- for (uint32_t il = 0; il < n_layer; ++il) {
+
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

  // Write value type
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
  io.write(&v_type_i, sizeof(v_type_i));

  // Write element size
- const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
+ const uint32_t v_size_el = ggml_type_size(layer.v->type);
  io.write(&v_size_el, sizeof(v_size_el));

  // Write GQA embedding size
@@ -1153,7 +1404,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
  const size_t range_size = range.second - range.first;
  const size_t src_offset = (range.first + j * kv_size) * v_size_el;
  const size_t buf_size = range_size * v_size_el;
- io.write_tensor(v_l[il], src_offset, buf_size);
+ io.write_tensor(layer.v, src_offset, buf_size);
  }
  }
  }
@@ -1170,8 +1421,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);

  batch.n_tokens = cell_count;
- batch.n_seq_tokens = cell_count;
- batch.n_seqs = 1;

  for (uint32_t i = 0; i < cell_count; ++i) {
  llama_pos pos;
@@ -1186,13 +1435,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  }

  batch.pos[i] = pos;
+ batch.n_seq_id[i] = 1;
+ batch.seq_id[i] = &dest_seq_id;
  }
- batch.n_seq_id[0] = 1;
- batch.seq_id[0] = &dest_seq_id;
+
  if (!find_slot(batch)) {
  LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
  return false;
  }
+
  commit();

  // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
@@ -1227,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  llama_seq_id seq_id;
  io.read_to(&seq_id, sizeof(seq_id));

- // TODO: llama_kv_cache_unified should have a notion of max sequences
- //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
- if (seq_id < 0) {
- //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
- LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
  return false;
  }

@@ -1249,11 +1497,12 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
  uint32_t v_trans;
  uint32_t n_layer;
+
  io.read_to(&v_trans, sizeof(v_trans));
  io.read_to(&n_layer, sizeof(n_layer));

- if (n_layer != hparams.n_layer) {
- LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+ if (n_layer != layers.size()) {
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
  return false;
  }
  if (cell_count > size) {
@@ -1266,13 +1515,15 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  }

  // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();

  // Read type of key
  int32_t k_type_i_ref;
  io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
- const int32_t k_type_i = (int32_t) k_l[il]->type;
+ const int32_t k_type_i = (int32_t) layer.k->type;
  if (k_type_i != k_type_i_ref) {
  LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
  return false;
@@ -1281,7 +1532,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  // Read row size of key
  uint64_t k_size_row_ref;
  io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
- const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+ const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
  if (k_size_row != k_size_row_ref) {
  LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
  return false;
@@ -1289,18 +1540,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell

  if (cell_count) {
  // Read and set the keys for the whole cell range
- ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+ ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
  }
  }

  if (!this->v_trans) {
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

  // Read type of value
  int32_t v_type_i_ref;
  io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
  if (v_type_i != v_type_i_ref) {
  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
  return false;
@@ -1309,7 +1562,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  // Read row size of value
  uint64_t v_size_row_ref;
  io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
- const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+ const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
  if (v_size_row != v_size_row_ref) {
  LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
  return false;
@@ -1317,18 +1570,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell

  if (cell_count) {
  // Read and set the values for the whole cell range
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
  }
  }
  } else {
  // For each layer, read the values for each cell (transposed)
- for (uint32_t il = 0; il < n_layer; ++il) {
+ for (const auto & layer : layers) {
+ const uint32_t il = layer.il;
+
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

  // Read type of value
  int32_t v_type_i_ref;
  io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
- const int32_t v_type_i = (int32_t)v_l[il]->type;
+ const int32_t v_type_i = (int32_t)layer.v->type;
  if (v_type_i != v_type_i_ref) {
  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
  return false;
@@ -1337,7 +1592,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  // Read element size of value
  uint32_t v_size_el_ref;
  io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
- const size_t v_size_el = ggml_type_size(v_l[il]->type);
+ const size_t v_size_el = ggml_type_size(layer.v->type);
  if (v_size_el != v_size_el_ref) {
  LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
  return false;
@@ -1355,7 +1610,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  // For each row in the transposed matrix, read the values for the whole cell range
  for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  const size_t dst_offset = (head + j * size) * v_size_el;
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
  }
  }
  }
@@ -1364,6 +1619,193 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  return true;
  }

+ //
+ // llama_kv_cache_unified_iswa
+ //
+
+ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool swa_full,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_batch,
+ uint32_t n_pad) : hparams(model.hparams) {
+ llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
+ llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+
+ const uint32_t size_base = kv_size;
+
+ uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
+
+ // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
+ if (swa_full) {
+ LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+ size_swa = size_base;
+ do_prune = false;
+ }
+
+ LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
+
+ kv_base = std::make_unique<llama_kv_cache_unified>(
+ model, std::move(filter_base), type_k, type_v,
+ v_trans, offload, size_base, n_seq_max, n_pad,
+ 0, LLAMA_SWA_TYPE_NONE);
+
+ LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
+
+ kv_swa = std::make_unique<llama_kv_cache_unified>(
+ model, std::move(filter_swa), type_k, type_v,
+ v_trans, offload, size_swa, n_seq_max, n_pad,
+ hparams.n_swa, hparams.swa_type);
+ }
+
+ void llama_kv_cache_unified_iswa::clear() {
+ kv_base->clear();
+ kv_swa ->clear();
+ }
+
+ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ bool res = true;
+
+ res = res & kv_base->seq_rm(seq_id, p0, p1);
+ res = res & kv_swa ->seq_rm(seq_id, p0, p1);
+
+ return res;
+ }
+
+ void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ }
+
+ void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+ kv_base->seq_keep(seq_id);
+ kv_swa ->seq_keep(seq_id);
+ }
+
+ void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ kv_base->seq_add(seq_id, p0, p1, delta);
+ kv_swa ->seq_add(seq_id, p0, p1, delta);
+ }
+
+ void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ kv_base->seq_div(seq_id, p0, p1, d);
+ kv_swa ->seq_div(seq_id, p0, p1, d);
+ }
+
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+ // the base cache is a superset of the SWA cache, so we can just check the SWA cache
+ return kv_swa->seq_pos_min(seq_id);
+ }
+
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+ return kv_swa->seq_pos_max(seq_id);
+ }
+
+ void llama_kv_cache_unified_iswa::restore() {
+ kv_base->restore();
+ kv_swa ->restore();
+ }
+
+ void llama_kv_cache_unified_iswa::commit() {
+ kv_base->commit();
+ kv_swa ->commit();
+
+ // slide the attention window, forgetting/pruning old tokens that are outside the window
+ if (do_prune) {
+ for (const auto & [seq_id, entry] : pending.pos) {
+ kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
+ }
+
+ }
+
+ pending.clear();
+ }
+
+ bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
+ bool res = true;
+
+ res = res & kv_base->update(lctx);
+ res = res & kv_swa ->update(lctx);
+
+ return res;
+ }
+
+ void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
+ kv_base->defrag_sched(thold);
+ kv_swa ->defrag_sched(thold);
+ }
+
+ void llama_kv_cache_unified_iswa::set_full() {
+ kv_base->set_full();
+ kv_swa ->set_full();
+ }
+
+ llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
+ pending.clear();
+
+ if (do_prune) {
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
+ const llama_seq_id seq_id = batch.seq_id[i][s];
+ const llama_pos pos = batch.pos[i];
+
+ if (pending.pos.find(seq_id) == pending.pos.end()) {
+ pending.pos[seq_id].pmin = pos;
+ pending.pos[seq_id].pmax = pos;
+ } else {
+ pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
+ pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
+ }
+ }
+ }
+ }
+
+ return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+ }
+
+ llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
+ GGML_UNUSED(embd_pooled);
+ return sbatch.split_simple(n_ubatch);
+ }
+
+ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
+ bool res = true;
+
+ res = res & kv_base->find_slot(batch);
+ res = res & kv_swa ->find_slot(batch);
+
+ return res;
+ }
+
+ bool llama_kv_cache_unified_iswa::get_can_shift() const {
+ return kv_base->get_size() == kv_swa->get_size();
+ }
+
+ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+ kv_base->state_write(io, seq_id);
+ kv_swa ->state_write(io, seq_id);
+ }
+
+ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+ kv_base->state_read(io, seq_id);
+ kv_swa ->state_read(io, seq_id);
+ }
+
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
+ return kv_base.get();
+ }
+
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
+ return kv_swa.get();
+ }
+
  //
  // llama_kv_cache_recurrent
  //
@@ -1373,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
  ggml_type type_k,
  ggml_type type_v,
  bool offload,
- uint32_t kv_size) : hparams(model.hparams) {
+ uint32_t kv_size,
+ uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
  const int32_t n_layer = hparams.n_layer;

- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+ LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
+ __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);

  head = 0;
  size = kv_size;
  used = 0;

- this->type_k = type_k;
- this->type_v = type_v;
-
  cells.clear();
  cells.resize(kv_size);

@@ -1683,8 +2123,24 @@ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_
  }
  }

+ llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
+ llama_pos result = std::numeric_limits<llama_pos>::max();
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id)) {
+ result = std::min(result, cells[i].pos);
+ }
+ }
+
+ if (result == std::numeric_limits<llama_pos>::max()) {
+ result = -1;
+ }
+
+ return result;
+ }
+
  llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
- llama_pos result = 0;
+ llama_pos result = -1;

  for (uint32_t i = 0; i < size; ++i) {
  if (cells[i].has_seq_id(seq_id)) {
@@ -1707,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() {
  pending.ranges.clear();
  }

- bool llama_kv_cache_recurrent::update(llama_context & lctx) {
- GGML_UNUSED(lctx);
+ bool llama_kv_cache_recurrent::update(llama_context & ctx) {
+ GGML_UNUSED(ctx);
  return false;
  }

@@ -1769,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot(
  if (seq_id < 0 || (uint32_t) seq_id >= size) {
  // too big seq_id
  // TODO: would it be possible to resize the cache instead?
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
  return false;
  }
  if (j > 0) {
@@ -1912,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot(
  return n >= n_seqs;
  }

- int32_t llama_kv_cache_recurrent::get_n_tokens() const {
- int32_t result = 0;
-
- for (uint32_t i = 0; i < size; i++) {
- result += cells[i].seq_id.size();
- }
-
- return result;
- }
-
- int32_t llama_kv_cache_recurrent::get_used_cells() const {
- return used;
- }
-
- llama_pos llama_kv_cache_recurrent::get_pos_max() const {
- llama_pos pos_max = -1;
- for (const auto & cell : cells) {
- pos_max = std::max(pos_max, cell.pos);
- }
-
- return pos_max;
- }
-
  bool llama_kv_cache_recurrent::get_can_shift() const {
  return false;
  }
@@ -2063,6 +2496,7 @@ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq
  io.read_to(&cell_count, sizeof(cell_count));

  bool res = true;
+
  res = res && state_read_meta(io, cell_count, seq_id);
  res = res && state_read_data(io, cell_count);

@@ -2391,104 +2825,3 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce

  return true;
  }
-
- //
- // kv cache view
- //
-
- llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
- llama_kv_cache_view result = {
- /*.n_cells = */ 0,
- /*.n_seq_max = */ n_seq_max,
- /*.token_count = */ 0,
- /*.used_cells = */ kv.get_used_cells(),
- /*.max_contiguous = */ 0,
- /*.max_contiguous_idx = */ -1,
- /*.cells = */ nullptr,
- /*.cells_sequences = */ nullptr,
- };
-
- return result;
- }
-
- void llama_kv_cache_view_free(llama_kv_cache_view * view) {
- if (view->cells != nullptr) {
- free(view->cells);
- view->cells = nullptr;
- }
- if (view->cells_sequences != nullptr) {
- free(view->cells_sequences);
- view->cells_sequences = nullptr;
- }
- }
-
- void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
- // TODO: rework this in the future, for now quick hack
- const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
- if (kvu == nullptr) {
- LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
- return;
- }
-
- if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
- view->n_cells = int32_t(kvu->size);
- void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
- view->cells = (llama_kv_cache_view_cell *)p;
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
- view->cells_sequences = (llama_seq_id *)p;
- }
-
- const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
- llama_kv_cache_view_cell * c_curr = view->cells;
- llama_seq_id * cs_curr = view->cells_sequences;
- int32_t used_cells = 0;
- int32_t token_count = 0;
- int32_t curr_contig_idx = -1;
- uint32_t max_contig = 0;
- int32_t max_contig_idx = -1;
-
- for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
- const size_t curr_size = kv_cells[i].seq_id.size();
- token_count += curr_size;
- c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
-
- if (curr_size > 0) {
- if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
- max_contig = i - curr_contig_idx;
- max_contig_idx = curr_contig_idx;
- }
- curr_contig_idx = -1;
- } else if (curr_contig_idx < 0) {
- curr_contig_idx = i;
- }
-
- int seq_idx = 0;
- for (const llama_seq_id it : kv_cells[i].seq_id) {
- if (seq_idx >= view->n_seq_max) {
- break;
- }
- cs_curr[seq_idx] = it;
- seq_idx++;
- }
- if (seq_idx != 0) {
- used_cells++;
- }
- for (; seq_idx < view->n_seq_max; seq_idx++) {
- cs_curr[seq_idx] = -1;
- }
- }
- if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
- max_contig_idx = curr_contig_idx;
- max_contig = kv_cells.size() - curr_contig_idx;
- }
- view->max_contiguous = max_contig;
- view->max_contiguous_idx = max_contig_idx;
- view->token_count = token_count;
- view->used_cells = used_cells;
- if (uint32_t(used_cells) != kvu->used) {
- LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
- __func__, kvu->used, used_cells);
- }
- }