@fugood/llama.node 1.1.8 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/lib/binding.ts +9 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +15 -5
  8. package/src/LlamaCompletionWorker.cpp +12 -3
  9. package/src/LlamaCompletionWorker.h +3 -1
  10. package/src/LlamaContext.cpp +14 -1
  11. package/src/llama.cpp/common/arg.cpp +6 -4
  12. package/src/llama.cpp/common/chat.cpp +34 -3
  13. package/src/llama.cpp/common/common.cpp +0 -15
  14. package/src/llama.cpp/common/common.h +1 -2
  15. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  23. package/src/llama.cpp/include/llama.h +1 -110
  24. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  25. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  26. package/src/llama.cpp/src/llama-arch.h +1 -0
  27. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  28. package/src/llama.cpp/src/llama-chat.h +1 -0
  29. package/src/llama.cpp/src/llama-context.cpp +5 -192
  30. package/src/llama.cpp/src/llama-context.h +2 -7
  31. package/src/llama.cpp/src/llama-cparams.h +0 -1
  32. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  33. package/src/llama.cpp/src/llama-graph.h +36 -46
  34. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  35. package/src/llama.cpp/src/llama-hparams.h +6 -0
  36. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
  37. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
  38. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
  39. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
  40. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
  43. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  44. package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
  45. package/src/llama.cpp/src/llama-memory.h +11 -8
  46. package/src/llama.cpp/src/llama-model.cpp +396 -187
  47. package/src/llama.cpp/src/llama-model.h +1 -0
@@ -1,4 +1,4 @@
- #include "llama-kv-cache-unified.h"
+ #include "llama-kv-cache.h"

  #include "llama-impl.h"
  #include "llama-io.h"
@@ -13,36 +13,29 @@
  #include <stdexcept>

  //
- // llama_kv_cache_unified
+ // llama_kv_cache
  //

- llama_kv_cache_unified::llama_kv_cache_unified(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- bool unified,
- uint32_t kv_size,
- uint32_t n_seq_max,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type) :
+ llama_kv_cache::llama_kv_cache(
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) :
  model(model), hparams(model.hparams), v_trans(v_trans),
  n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

  GGML_ASSERT(kv_size % n_pad == 0);

- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- auto n_layer_cache = hparams.n_layer;
- if (model.arch == LLM_ARCH_GEMMA3N) {
- n_layer_cache = 20;
- }
- if (model.arch == LLM_ARCH_GLM4_MOE) {
- // GLM-4.5: Only process up to last layer, skip final NextN layer
- n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
- }
+ const uint32_t n_layer_kv = hparams.n_layer_kv();

  // create a context for each buffer type
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
  };
@@ -97,9 +90,14 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  __func__, hparams.n_embd_v_gqa_max());
  }

- for (uint32_t il = 0; il < n_layer_cache; il++) {
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (!hparams.has_kv(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+ continue;
+ }
+
  if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
  continue;
  }

@@ -147,23 +145,27 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  layers.push_back({ il, k, v, k_stream, v_stream, });
  }

- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- if (model.arch == LLM_ARCH_GEMMA3N) {
- LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+ if (reuse) {
+ LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

- for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
- if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ const int32_t il_reuse = reuse(il);
+
+ if (il_reuse < 0) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
  continue;
  }

- const bool is_swa = hparams.is_swa(il);
- const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+ continue;
+ }

  GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
  map_layer_ids[il] = map_layer_ids[il_reuse];

- LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+ LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
  }
  }

@@ -209,7 +211,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
  }
  }

- void llama_kv_cache_unified::clear(bool data) {
+ void llama_kv_cache::clear(bool data) {
  for (uint32_t s = 0; s < n_stream; ++s) {
  v_cells[s].reset();
  v_heads[s] = 0;
@@ -222,7 +224,7 @@ void llama_kv_cache_unified::clear(bool data) {
  }
  }

- bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
  GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));

  if (p0 < 0) {
@@ -285,7 +287,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
  return true;
  }

- void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
  GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
  GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());

@@ -368,7 +370,7 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
  //}
  }

- void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
  GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());

  auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -390,7 +392,7 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
  }
  }

- void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
  GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());

  auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -434,7 +436,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
  head = new_head != cells.size() ? new_head : 0;
  }

- void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
  GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());

  auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -467,7 +469,7 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
  }
  }

- llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const {
  GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());

  const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -475,7 +477,7 @@ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
  return cells.seq_pos_min(seq_id);
  }

- llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
  GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());

  const auto & cells = v_cells[seq_to_stream[seq_id]];
@@ -483,7 +485,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
  return cells.seq_pos_max(seq_id);
  }

- llama_memory_context_ptr llama_kv_cache_unified::init_batch(
+ llama_memory_context_ptr llama_kv_cache::init_batch(
  llama_batch_allocr & balloc,
  uint32_t n_ubatch,
  bool embd_all) {
@@ -513,62 +515,34 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch(
  break;
  }

- return std::make_unique<llama_kv_cache_unified_context>(
+ return std::make_unique<llama_kv_cache_context>(
  this, std::move(sinfos), std::move(ubatches));
  } while (false);

- return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ return std::make_unique<llama_kv_cache_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
  }

- llama_memory_context_ptr llama_kv_cache_unified::init_full() {
- return std::make_unique<llama_kv_cache_unified_context>(this);
+ llama_memory_context_ptr llama_kv_cache::init_full() {
+ return std::make_unique<llama_kv_cache_context>(this);
  }

- llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
- bool do_shift = get_has_shift();
-
- defrag_info dinfo;
-
- // see if we need to defrag
- if (n_stream == 1) {
- // note : for now do not consider defrag for n_stream > 1
- const auto & cells = v_cells[seq_to_stream[0]];
-
- bool do_defrag = optimize;
-
- const auto thold = lctx->get_cparams().defrag_thold;
-
- if (!do_defrag && thold > 0.0f) {
- const auto n_kv = cells.used_max_p1();
-
- // - do not defrag small contexts (i.e. < 2048 tokens)
- // - count the padding towards the number of used tokens
- const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
-
- if (fragmentation > thold) {
- LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
-
- do_defrag = true;
- }
- }
+ llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
+ GGML_UNUSED(optimize);

- if (do_defrag) {
- dinfo = defrag_prepare(lctx->graph_max_nodes());
- }
- }
+ bool do_shift = get_has_shift();

- return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
+ return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
  }

- llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
- llama_kv_cache_unified::slot_info_vec_t res;
+ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
+ llama_kv_cache::slot_info_vec_t res;

  struct state_t {
  slot_info sinfo; // slot info for the ubatch

  std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch

- std::vector<llama_kv_cells_unified> v_cells; // copy of the old cells, before placing the ubatch
+ std::vector<llama_kv_cells> v_cells; // copy of the old cells, before placing the ubatch
  };

  // remember the old state of the cells so we can restore it in the end
@@ -629,7 +603,7 @@ llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const st
  return res;
  }

- bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
+ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
  bool updated = false;

  auto * sched = lctx->get_sched();
@@ -699,57 +673,10 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
  }
  }

- if (!dinfo.empty()) {
- LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-
- // note: for now do not consider defrag for n_stream > 1
- auto & cells = v_cells[seq_to_stream[0]];
- auto & head = v_heads[seq_to_stream[0]];
-
- // apply moves:
- {
- const auto n_kv = dinfo.ids.size();
-
- for (uint32_t i = 0; i < n_kv; ++i) {
- assert(dinfo.ids[i] <= n_kv);
-
- if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
- continue;
- }
-
- cells.mv(i, dinfo.ids[i]);
- }
-
- // reset the head so we can find the first free slot during the next ubatch
- head = 0;
- }
-
- ggml_backend_sched_reset(sched);
-
- auto * res = lctx->get_gf_res_reserve();
-
- res->reset();
-
- auto * gf = build_graph_defrag(res, lctx, dinfo);
- if (!ggml_backend_sched_alloc_graph(sched, gf)) {
- LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
- return updated;
- }
-
- res->set_inputs(nullptr);
-
- if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
- LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
- return updated;
- }
-
- updated = true;
- }
-
  return updated;
  }

- llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
+ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const {

  if (debug > 0) {
  for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
@@ -948,7 +875,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
  return res;
  }

- void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
+ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
  // keep track of the max sequence position that we would overwrite with this ubatch
  // for non-SWA cache, this would be always empty
  llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
@@ -1013,21 +940,21 @@ void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_u
  }
  }

- bool llama_kv_cache_unified::get_can_shift() const {
+ bool llama_kv_cache::get_can_shift() const {
  return true;
  }

- uint32_t llama_kv_cache_unified::get_size() const {
+ uint32_t llama_kv_cache::get_size() const {
  const auto & cells = v_cells[seq_to_stream[0]];

  return cells.size();
  }

- uint32_t llama_kv_cache_unified::get_n_stream() const {
+ uint32_t llama_kv_cache::get_n_stream() const {
  return n_stream;
  }

- bool llama_kv_cache_unified::get_has_shift() const {
+ bool llama_kv_cache::get_has_shift() const {
  bool result = false;

  for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1037,7 +964,7 @@ bool llama_kv_cache_unified::get_has_shift() const {
  return result;
  }

- uint32_t llama_kv_cache_unified::get_n_kv() const {
+ uint32_t llama_kv_cache::get_n_kv() const {
  uint32_t result = 0;

  for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1049,11 +976,11 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
  return result;
  }

- bool llama_kv_cache_unified::get_supports_set_rows() const {
+ bool llama_kv_cache::get_supports_set_rows() const {
  return supports_set_rows;
  }

- ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
  const int32_t ikv = map_layer_ids.at(il);

  auto * k = layers[ikv].k;
@@ -1073,7 +1000,7 @@ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint
  ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
  }

- ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
+ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
  const int32_t ikv = map_layer_ids.at(il);

  auto * v = layers[ikv].v;
@@ -1105,7 +1032,7 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
  ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
  }

- ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
  const int32_t ikv = map_layer_ids.at(il);

  auto * k = layers[ikv].k;
@@ -1135,7 +1062,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_
  return ggml_cpy(ctx, k_cur, k_view);
  }

- ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
  const int32_t ikv = map_layer_ids.at(il);

  auto * v = layers[ikv].v;
@@ -1189,7 +1116,7 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
  return ggml_cpy(ctx, v_cur, v_view);
  }

- ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
  const uint32_t n_tokens = ubatch.n_tokens;

  ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
@@ -1199,7 +1126,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, con
  return k_idxs;
  }

- ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
  const uint32_t n_tokens = ubatch.n_tokens;

  ggml_tensor * v_idxs;
@@ -1215,7 +1142,7 @@ ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, con
  return v_idxs;
  }

- void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
  if (!supports_set_rows) {
  return;
  }
@@ -1235,7 +1162,7 @@ void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_uba
  }
  }

- void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+ void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
  if (!supports_set_rows) {
  return;
  }
@@ -1272,7 +1199,7 @@ void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_uba
  }
  }

- void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
+ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
  GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));

  int32_t * data = (int32_t *) dst->data;
@@ -1286,7 +1213,7 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
  }
  }

- void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
  const uint32_t n_tokens = ubatch->n_tokens;

  GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
@@ -1358,7 +1285,7 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
  }
  }

- void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
  const int64_t n_tokens = ubatch->n_tokens;

  GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
@@ -1383,7 +1310,7 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
  }
  }

- size_t llama_kv_cache_unified::total_size() const {
+ size_t llama_kv_cache::total_size() const {
  size_t size = 0;

  for (const auto & buf : bufs) {
@@ -1393,7 +1320,7 @@ size_t llama_kv_cache_unified::total_size() const {
  return size;
  }

- size_t llama_kv_cache_unified::size_k_bytes() const {
+ size_t llama_kv_cache::size_k_bytes() const {
  size_t size_k_bytes = 0;

  for (const auto & layer : layers) {
@@ -1403,7 +1330,7 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
  return size_k_bytes;
  }

- size_t llama_kv_cache_unified::size_v_bytes() const {
+ size_t llama_kv_cache::size_v_bytes() const {
  size_t size_v_bytes = 0;

  for (const auto & layer : layers) {
@@ -1413,7 +1340,7 @@ size_t llama_kv_cache_unified::size_v_bytes() const {
  return size_v_bytes;
  }

- ggml_tensor * llama_kv_cache_unified::build_rope_shift(
+ ggml_tensor * llama_kv_cache::build_rope_shift(
  const llama_cparams & cparams,
  ggml_context * ctx,
  ggml_tensor * cur,
@@ -1465,14 +1392,14 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(

  class llm_graph_input_k_shift : public llm_graph_input_i {
  public:
- llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+ llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {}
  virtual ~llm_graph_input_k_shift() = default;

  void set_input(const llama_ubatch * ubatch) override;

  ggml_tensor * k_shift; // I32 [kv_size*n_stream]

- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache * kv_self;
  };

  void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
@@ -1483,7 +1410,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
  }
  }

- ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
+ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
  auto * ctx = res->get_ctx();
  auto * gf = res->get_gf();

@@ -1525,284 +1452,7 @@ ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res,
  return gf;
  }

- ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
- llm_graph_result * res,
- llama_context * lctx,
- const defrag_info & dinfo) const {
- auto * ctx = res->get_ctx();
- auto * gf = res->get_gf();
-
- GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
- const auto & cells = v_cells[0];
-
- const auto & ids = dinfo.ids;
-
- const auto & cparams = lctx->get_cparams();
-
- #if 0
- // CPU defrag
- //
- // TODO: optimizations are possible:
- // - multiple threads
- // - avoid copying to the host memory when already there
- //
- // likely not worth the effort, as we have ggml_graph based defrag
- //
-
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
- const uint32_t kv_size = size;
-
- std::vector<uint8_t> buf_k;
- std::vector<uint8_t> buf_v;
-
- for (uint32_t il = 0; il < n_layer; ++il) {
- const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
- const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
- const size_t v_size_el = ggml_type_size(v_l[il]->type);
- const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
- buf_k.resize(k_size);
- buf_v.resize(v_size);
-
- ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
- ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
- // batch move [i, i+nm) to [id, id+nm)
- // note: cells can move only to a lower index
- for (uint32_t i = 0; i < n_kv; ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == n_kv) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < n_kv && ids[i + nm] == id + nm) {
- nm++;
- }
-
- // move keys
- {
- const int64_t os = i*k_size_row;
- const int64_t od = id*k_size_row;
-
- memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
- }
-
- // move values (note: they are transposed)
- {
- const int64_t os = i;
- const int64_t od = id;
-
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
- }
- }
-
- i += nm - 1;
- }
-
- ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
- ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
- }
- #else
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
- for (const auto & layer : layers) {
- const uint32_t il = layer.il;
-
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
- ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
- n_embd_k_gqa, nm,
- ggml_row_size(layer.k->type, n_embd_k_gqa),
- ggml_row_size(layer.k->type, n_embd_k_gqa*i));
-
- ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
- n_embd_k_gqa, nm,
- ggml_row_size(layer.k->type, n_embd_k_gqa),
- ggml_row_size(layer.k->type, n_embd_k_gqa*id));
-
- ggml_tensor * view_v_src;
- ggml_tensor * view_v_dst;
-
- if (cparams.flash_attn) {
- // NOTE: the V cache is not transposed when using flash attention
- view_v_src = ggml_view_2d(ctx, layer.v,
- n_embd_v_gqa, nm,
- ggml_row_size(layer.v->type, n_embd_v_gqa),
- ggml_row_size(layer.v->type, n_embd_v_gqa*i));
-
- view_v_dst = ggml_view_2d(ctx, layer.v,
- n_embd_v_gqa, nm,
- ggml_row_size(layer.v->type, n_embd_v_gqa),
- ggml_row_size(layer.v->type, n_embd_v_gqa*id));
- } else {
- view_v_src = ggml_view_2d(ctx, layer.v,
- nm, n_embd_v_gqa,
- ggml_row_size(layer.v->type, cells.size()),
- ggml_row_size(layer.v->type, i));
-
- view_v_dst = ggml_view_2d(ctx, layer.v,
- nm, n_embd_v_gqa,
- ggml_row_size(layer.v->type, cells.size()),
- ggml_row_size(layer.v->type, id));
- }
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
- ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
- }
-
- i += nm - 1;
- }
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
- #endif
-
- return gf;
- }
-
- llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
- GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
- const auto & cells = v_cells[0];
-
- const uint32_t n_layer = layers.size();
-
- const uint32_t n_kv = cells.used_max_p1();
- const uint32_t n_used = cells.get_used();
-
- assert(n_used <= n_kv);
-
- //const int64_t t_start = ggml_time_us();
-
- // number of cells moved
- uint32_t n_moves = 0;
-
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
- // - source view, destination view, copy operation
- // - x2 for keys and values
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
-
- // determine which KV cells to move where
- defrag_info res;
- auto & ids = res.ids;
-
- ids.resize(n_kv, n_kv);
-
- for (uint32_t i0 = 0; i0 < n_used; ++i0) {
- if (!cells.is_empty(i0)) {
- ids[i0] = i0;
-
- continue;
- }
-
- // found a hole - fill it with data from the end of the cache
-
- uint32_t nh = 1;
-
- // determine the size of the hole
- while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
- nh++;
- }
-
- uint32_t nf = 0;
- uint32_t is = n_kv - 1;
-
- // starting from the end, find nh non-empty cells
- for (; is > i0; --is) {
- if (cells.is_empty(is) || ids[is] != n_kv) {
- continue;
- }
-
- // non-empty cell which is not yet moved
- nf++;
-
- if (nf == nh) {
- break;
- }
- }
-
- // this can only happen if `n_used` is not accurate, which would be a bug
- GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
- nf = 0;
-
- uint32_t i1 = is;
-
- // are we moving a continuous block of memory?
- bool cont = false;
-
- // should we stop searching for the next move?
- bool stop = false;
-
- // go back and move the nf cells to the hole
- for (; i1 < n_kv; ++i1) {
- if (cells.is_empty(i1) || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
- cont = false;
- continue;
- }
-
- // this cell goes to (i0 + nf)
- ids[i1] = i0 + nf;
-
- if (!cont) {
- n_moves++;
- cont = true;
- }
-
- nf++;
-
- if (nf == nh) {
- break;
- }
- }
-
- if (stop || n_moves == max_moves) {
- break;
- }
-
- //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
- i0 += nh - 1;
- }
-
- if (n_moves == 0) {
- return {};
- }
-
- LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
- LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
-
- return res;
- }
-
- bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
+ bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
  assert(p0 >= 0 && p1 >= 0);

  switch (swa_type) {
@@ -1828,7 +1478,7 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
  return false;
  }

- void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
  GGML_UNUSED(flags);

  io.write(&n_stream, sizeof(n_stream));
@@ -1881,7 +1531,7 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
  }
  }

- void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
  GGML_UNUSED(flags);

  GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
@@ -1917,7 +1567,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
  }
  }

- void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
+ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
  const auto & cells = v_cells[cr.strm];

  for (const auto & range : cr.data) {
@@ -1945,7 +1595,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const cell_
  }
  }

- void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
+ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
  const auto & cells = v_cells[cr.strm];

  const uint32_t v_trans = this->v_trans ? 1 : 0;
@@ -2040,7 +1690,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const cell_
  }
  }

- bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
  auto & cells = v_cells[strm];
  auto & head = v_heads[strm];

@@ -2137,7 +1787,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t strm
  return true;
  }

- bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
  auto & cells = v_cells[strm];
  auto & head = v_heads[strm];

@@ -2274,13 +1924,13 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t strm
  }

  //
- // llama_kv_cache_unified_context
+ // llama_kv_cache_context
  //

- llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
+ llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {}

- llama_kv_cache_unified_context::llama_kv_cache_unified_context(
- llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
+ llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
  n_kv = kv->get_size();

  const uint32_t n_stream = kv->get_n_stream();
@@ -2296,26 +1946,25 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
  }
  }

- llama_kv_cache_unified_context::llama_kv_cache_unified_context(
- llama_kv_cache_unified * kv,
+ llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv,
  llama_context * lctx,
  bool do_shift,
- defrag_info dinfo,
- stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) {
- if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
+ stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
+ if (!do_shift && this->sc_info.empty()) {
  status = LLAMA_MEMORY_STATUS_NO_UPDATE;
  }
  }

- llama_kv_cache_unified_context::llama_kv_cache_unified_context(
- llama_kv_cache_unified * kv,
- llama_kv_cache_unified::slot_info_vec_t sinfos,
+ llama_kv_cache_context::llama_kv_cache_context(
+ llama_kv_cache * kv,
+ llama_kv_cache::slot_info_vec_t sinfos,
  std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
  }

- llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
+ llama_kv_cache_context::~llama_kv_cache_context() = default;

- bool llama_kv_cache_unified_context::next() {
+ bool llama_kv_cache_context::next() {
  assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

  if (++i_cur >= ubatches.size()) {
@@ -2325,12 +1974,12 @@ bool llama_kv_cache_unified_context::next() {
  return true;
  }

- bool llama_kv_cache_unified_context::apply() {
+ bool llama_kv_cache_context::apply() {
  assert(!llama_memory_status_is_fail(status));

  // no ubatches -> this is a KV cache update
  if (ubatches.empty()) {
- kv->update(lctx, do_shift, dinfo, sc_info);
+ kv->update(lctx, do_shift, sc_info);

  return true;
  }
@@ -2342,69 +1991,69 @@ bool llama_kv_cache_unified_context::apply() {
  return true;
  }

- llama_memory_status llama_kv_cache_unified_context::get_status() const {
+ llama_memory_status llama_kv_cache_context::get_status() const {
  return status;
  }

- const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
+ const llama_ubatch & llama_kv_cache_context::get_ubatch() const {
  assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

  return ubatches[i_cur];
  }

- uint32_t llama_kv_cache_unified_context::get_n_kv() const {
+ uint32_t llama_kv_cache_context::get_n_kv() const {
  return n_kv;
  }

- bool llama_kv_cache_unified_context::get_supports_set_rows() const {
+ bool llama_kv_cache_context::get_supports_set_rows() const {
  return kv->get_supports_set_rows();
  }

- ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
+ ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
  return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
  }

- ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const {
+ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
  return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
  }

- ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+ ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
  return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
  }

- ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
+ ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
  return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
  }

- ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
  return kv->build_input_k_idxs(ctx, ubatch);
  }

- ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
  return kv->build_input_v_idxs(ctx, ubatch);
  }

- void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const {
+ void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
  kv->set_input_k_shift(dst);
  }

- void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
  kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
  }

- void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
  kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
  }

- void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
  kv->set_input_kq_mask(dst, ubatch, causal_attn);
  }

- void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
  kv->set_input_pos_bucket(dst, ubatch);
  }

- uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
+ uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
  // the FA kernels require padding to avoid extra runtime boundary checks
  return cparams.flash_attn ? 256u : 32u;
  }
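
The KV-cache rework shown above replaces the hard-coded GEMMA3N / GLM4_MOE layer handling with two caller-supplied callbacks on the renamed llama_kv_cache constructor: layer_filter_cb decides whether a layer keeps its own KV cells, and layer_reuse_cb returns the index of an earlier layer whose cells are shared (a negative value means no reuse), while the per-layer default now comes from hparams.n_layer_kv() / hparams.has_kv(il) (see the llama-hparams.cpp/.h entries in the file list). The following is only a rough sketch of what such callbacks could look like, assuming std::function-style callback types and a caller-provided is_swa predicate (neither is defined in this diff); it mirrors the removed n_layer_cache - (is_swa ? 2 : 1) rule and is not code shipped in this package.

    #include <cstdint>
    #include <functional>

    // Assumed aliases -- the real typedefs live in the llama.cpp headers, not here.
    using layer_filter_cb = std::function<bool(int32_t il)>;    // keep layer il in the cache?
    using layer_reuse_cb  = std::function<int32_t(int32_t il)>; // layer to reuse, or < 0 for none

    // Hypothetical policy: layers [0, n_layer_cache) own KV cells, later layers
    // share the cells of the last SWA / non-SWA layer, like the removed
    // `n_layer_cache - (is_swa ? 2 : 1)` rule.
    layer_reuse_cb make_reuse_cb(int32_t n_layer_cache, std::function<bool(int32_t)> is_swa) {
        return [=](int32_t il) -> int32_t {
            if (il < n_layer_cache) {
                return -1; // no reuse: this layer keeps its own KV cells
            }
            return n_layer_cache - (is_swa(il) ? 2 : 1);
        };
    }

    layer_filter_cb make_filter_cb() {
        return [](int32_t /*il*/) { return true; }; // cache every layer that has KV
    }

With callbacks of this shape, the 20-layer special case for Gemma3n becomes a policy the caller passes in, instead of an arch check inside the cache constructor.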