cui-llama.rn 1.4.4 → 1.5.0

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (216)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-kv-cache.cpp
@@ -1,718 +1,1380 @@
1
- #include "llama-kv-cache.h"
2
-
3
- #include "llama-impl.h"
4
- #include "llama-batch.h"
5
- #include "llama-cparams.h"
6
- #include "llama-model.h"
7
-
8
- #include <algorithm>
9
- #include <limits>
10
- #include <map>
11
-
12
- static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
13
-
14
- uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
15
- // the FA kernels require padding to avoid extra runtime boundary checks
16
- return cparams.flash_attn ? 256u : 32u;
17
- }
18
-
19
- bool llama_kv_cache_init(
20
- struct llama_kv_cache & cache,
21
- const llama_model & model,
22
- const llama_cparams & cparams,
23
- lm_ggml_type type_k,
24
- lm_ggml_type type_v,
25
- uint32_t kv_size,
26
- bool offload) {
27
- const struct llama_hparams & hparams = model.hparams;
28
-
29
- const int32_t n_layer = hparams.n_layer;
30
-
31
- cache.has_shift = false;
32
-
33
- cache.recurrent = llama_model_is_recurrent(&model);
34
- cache.v_trans = !cache.recurrent && !cparams.flash_attn;
35
- cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
36
-
37
- LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
38
- __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer, cache.can_shift);
39
-
40
- cache.head = 0;
41
- cache.size = kv_size;
42
- cache.used = 0;
43
-
44
- cache.type_k = type_k;
45
- cache.type_v = type_v;
46
-
47
- cache.cells.clear();
48
- cache.cells.resize(kv_size);
49
-
50
- // create a context for each buffer type
51
- std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
52
- auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
53
- auto it = ctx_map.find(buft);
54
- if (it == ctx_map.end()) {
55
- struct lm_ggml_init_params params = {
56
- /*.mem_size =*/ size_t(2u*n_layer*lm_ggml_tensor_overhead()),
57
- /*.mem_buffer =*/ NULL,
58
- /*.no_alloc =*/ true,
59
- };
60
- lm_ggml_context * ctx = lm_ggml_init(params);
61
- if (!ctx) {
62
- return nullptr;
63
- }
64
- ctx_map[buft] = ctx;
65
- cache.ctxs.emplace_back(ctx);
66
- return ctx;
67
- }
68
- return it->second;
69
- };
70
-
71
- cache.k_l.reserve(n_layer);
72
- cache.v_l.reserve(n_layer);
73
-
74
- for (int i = 0; i < n_layer; i++) {
75
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
76
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
77
-
78
- LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
79
-
80
- lm_ggml_backend_buffer_type_t buft;
81
- if (offload) {
82
- auto * dev = model.dev_layer(i);
83
- buft = lm_ggml_backend_dev_buffer_type(dev);
84
- } else {
85
- buft = lm_ggml_backend_cpu_buffer_type();
86
- }
87
- lm_ggml_context * ctx = ctx_for_buft(buft);
88
-
89
- if (!ctx) {
90
- LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
91
- return false;
92
- }
93
-
94
- lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
95
- lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
96
- lm_ggml_format_name(k, "cache_k_l%d", i);
97
- lm_ggml_format_name(v, "cache_v_l%d", i);
98
- cache.k_l.push_back(k);
99
- cache.v_l.push_back(v);
100
- }
101
-
102
- // allocate tensors and initialize the buffers to avoid NaNs in the padding
103
- for (auto it : ctx_map) {
104
- auto * buft = it.first;
105
- auto * ctx = it.second;
106
-
107
- lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
108
- if (!buf) {
109
- LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
110
- return false;
111
- }
112
- lm_ggml_backend_buffer_clear(buf, 0);
113
- LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
114
- cache.bufs.emplace_back(buf);
115
- }
116
-
117
- return true;
118
- }
119
-
120
- struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
121
- struct llama_kv_cache & cache,
122
- const struct llama_ubatch & ubatch) {
123
- const uint32_t n_tokens = ubatch.n_tokens;
124
- const uint32_t n_seqs = ubatch.n_seqs;
125
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
126
-
127
- if (cache.recurrent) {
128
- // For recurrent state architectures (like Mamba or RWKV),
129
- // each cache cell can store the state for a whole sequence.
130
- // A slot should be always be contiguous.
131
-
132
- // can only process batches with an equal number of new tokens in each sequence
133
- LM_GGML_ASSERT(ubatch.equal_seqs);
134
-
135
- int32_t min = cache.size - 1;
136
- int32_t max = 0;
137
-
138
- // everything should fit if all seq_ids are smaller than the max
139
- for (uint32_t s = 0; s < n_seqs; ++s) {
140
- const uint32_t n_seq_id = ubatch.n_seq_id[s];
141
- for (uint32_t j = 0; j < n_seq_id; ++j) {
142
- const llama_seq_id seq_id = ubatch.seq_id[s][j];
143
-
144
- if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
145
- // too big seq_id
146
- // TODO: would it be possible to resize the cache instead?
147
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
148
- return llama_kv_cache_slot_info_failed;
149
- }
150
- if (j > 0) {
151
- llama_kv_cell & seq = cache.cells[seq_id];
152
- if (seq.tail >= 0) {
153
- llama_kv_cell & cell = cache.cells[seq.tail];
154
- // clear cells from seq_ids that become shared
155
- // (should not normally happen, but let's handle it anyway)
156
- cell.seq_id.erase(seq_id);
157
- seq.tail = -1;
158
- if (cell.seq_id.empty()) {
159
- cell.pos = -1;
160
- cell.src = -1;
161
- cache.used -= 1;
162
- }
163
- }
164
- }
165
- }
166
- }
167
-
168
- #ifndef NDEBUG
169
- {
170
- std::vector<int32_t> tails_verif;
171
- tails_verif.assign(cache.size, -1);
172
- for (uint32_t i = 0; i < cache.size; ++i) {
173
- llama_kv_cell & cell = cache.cells[i];
174
- for (llama_seq_id seq_id : cell.seq_id) {
175
- if (tails_verif[seq_id] != -1) {
176
- LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
177
- }
178
- tails_verif[seq_id] = i;
179
- }
180
- }
181
- for (uint32_t i = 0; i < cache.size; ++i) {
182
- if (tails_verif[i] != cache.cells[i].tail) {
183
- LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
184
- }
185
- }
186
- }
187
- #endif
188
-
189
- // find next empty cell
190
- uint32_t next_empty_cell = cache.head;
191
-
192
- for (uint32_t i = 0; i < cache.size; ++i) {
193
- if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
194
- llama_kv_cell & cell = cache.cells[next_empty_cell];
195
- if (cell.is_empty()) { break; }
196
- next_empty_cell += 1;
197
- }
198
-
199
- // find usable cell range
200
- for (uint32_t s = 0; s < n_seqs; ++s) {
201
- const llama_seq_id seq_id = ubatch.seq_id[s][0];
202
- llama_kv_cell & seq_meta = cache.cells[seq_id];
203
- bool has_cell = false;
204
- if (seq_meta.tail >= 0) {
205
- llama_kv_cell & cell = cache.cells[seq_meta.tail];
206
- LM_GGML_ASSERT(cell.has_seq_id(seq_id));
207
- // does this seq_id "own" the cell?
208
- if (cell.seq_id.size() == 1) { has_cell = true; }
209
- }
210
- if (!has_cell) {
211
- llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
212
- LM_GGML_ASSERT(empty_cell.is_empty());
213
- // copy old tail into the empty cell
214
- if (seq_meta.tail >= 0) {
215
- llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
216
- empty_cell.pos = orig_cell.pos;
217
- empty_cell.src = orig_cell.src;
218
- orig_cell.seq_id.erase(seq_id);
219
- empty_cell.seq_id.insert(seq_id); // will be overwritten
220
- }
221
- seq_meta.tail = next_empty_cell;
222
- // find next empty cell
223
- if (s + 1 < n_seqs) {
224
- next_empty_cell += 1;
225
- for (uint32_t i = 0; i < cache.size; ++i) {
226
- if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
227
- llama_kv_cell & cell = cache.cells[next_empty_cell];
228
- if (cell.is_empty()) { break; }
229
- next_empty_cell += 1;
230
- }
231
- }
232
- }
233
- if (min > seq_meta.tail) { min = seq_meta.tail; }
234
- if (max < seq_meta.tail) { max = seq_meta.tail; }
235
- }
236
-
237
- // gather and re-order
238
- for (uint32_t s = 0; s < n_seqs; ++s) {
239
- int32_t dst_id = s + min;
240
- int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
241
- if (dst_id != src_id) {
242
- llama_kv_cell & dst_cell = cache.cells[dst_id];
243
- llama_kv_cell & src_cell = cache.cells[src_id];
244
-
245
- std::swap(dst_cell.pos, src_cell.pos);
246
- std::swap(dst_cell.src, src_cell.src);
247
- std::swap(dst_cell.seq_id, src_cell.seq_id);
248
-
249
- // swap tails (assuming they NEVER overlap)
250
- for (const llama_seq_id seq_id : src_cell.seq_id) {
251
- cache.cells[seq_id].tail = src_id;
252
- }
253
- for (const llama_seq_id seq_id : dst_cell.seq_id) {
254
- cache.cells[seq_id].tail = dst_id;
255
- }
256
- }
257
- }
258
-
259
- // update the pos of the used seqs
260
- for (uint32_t s = 0; s < n_seqs; ++s) {
261
- const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
262
- int32_t cell_id = s + min;
263
- llama_kv_cell & cell = cache.cells[cell_id];
264
-
265
- if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
266
- // What should happen when the pos backtracks or skips a value?
267
- // Clearing the state mid-batch would require special-casing which isn't done.
268
- LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
269
- __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
270
- }
271
- cell.pos = last_pos;
272
- cell.seq_id.clear();
273
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
274
- const llama_seq_id seq_id = ubatch.seq_id[s][j];
275
- cell.seq_id.insert(seq_id);
276
- cache.cells[seq_id].tail = cell_id;
277
- }
278
- }
279
-
280
- // allow getting the range of used cells, from head to head + n
281
- cache.head = min;
282
- cache.n = max - min + 1;
283
- cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
284
- [](const llama_kv_cell& cell){ return !cell.is_empty(); });
285
-
286
- // sanity check
287
- return llama_kv_cache_slot_info(cache.n >= n_seqs);
288
- }
289
- // otherwise, one cell per token.
290
-
291
- if (n_tokens > cache.size) {
292
- LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
293
- return llama_kv_cache_slot_info_failed;
294
- }
295
-
296
- uint32_t n_tested = 0;
297
-
298
- while (true) {
299
- if (cache.head + n_tokens > cache.size) {
300
- n_tested += cache.size - cache.head;
301
- cache.head = 0;
302
- continue;
303
- }
304
-
305
- bool found = true;
306
- for (uint32_t i = 0; i < n_tokens; i++) {
307
- if (cache.cells[cache.head + i].pos >= 0) {
308
- found = false;
309
- cache.head += i + 1;
310
- n_tested += i + 1;
311
- break;
312
- }
313
- }
314
-
315
- if (found) {
316
- break;
317
- }
318
-
319
- if (n_tested >= cache.size) {
320
- //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
321
- return llama_kv_cache_slot_info_failed;
322
- }
323
- }
324
-
325
- for (uint32_t s = 0; s < n_seqs; s++) {
326
- for (uint32_t i = 0; i < n_seq_tokens; ++i) {
327
- uint32_t k = s*n_seq_tokens + i;
328
- cache.cells[cache.head + k].pos = ubatch.pos[k];
329
-
330
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
331
- cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
332
- }
333
- }
334
- }
335
-
336
- cache.used += n_tokens;
337
-
338
- return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
339
- }
340
-
341
- uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
342
- for (uint32_t i = cache.size; i > 0; --i) {
343
- const llama_kv_cell & cell = cache.cells[i - 1];
344
-
345
- if (cell.pos >= 0 && !cell.is_empty()) {
346
- return i;
347
- }
348
- }
349
-
350
- return 0;
351
- }
352
-
353
- void llama_kv_cache_clear(struct llama_kv_cache & cache) {
354
- for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
355
- cache.cells[i].pos = -1;
356
- cache.cells[i].seq_id.clear();
357
- cache.cells[i].src = -1;
358
- cache.cells[i].tail = -1;
359
- }
360
- cache.head = 0;
361
- cache.used = 0;
362
-
363
- for (auto & buf : cache.bufs) {
364
- lm_ggml_backend_buffer_clear(buf.get(), 0);
365
- }
366
- }
367
-
368
- bool llama_kv_cache_seq_rm(
369
- struct llama_kv_cache & cache,
370
- llama_seq_id seq_id,
371
- llama_pos p0,
372
- llama_pos p1) {
373
- uint32_t new_head = cache.size;
374
-
375
- if (p0 < 0) p0 = 0;
376
- if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
377
-
378
- // models like Mamba or RWKV can't have a state partially erased
379
- if (cache.recurrent) {
380
- if (seq_id >= (int64_t) cache.size) {
381
- // could be fatal
382
- return false;
383
- }
384
- if (0 <= seq_id) {
385
- int32_t & tail_id = cache.cells[seq_id].tail;
386
- if (tail_id >= 0) {
387
- const llama_kv_cell & cell = cache.cells[tail_id];
388
- // partial intersection is invalid
389
- if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
390
- return false;
391
- }
392
- // invalidate tails which will be cleared
393
- if (p0 <= cell.pos && cell.pos < p1) {
394
- tail_id = -1;
395
- }
396
- }
397
- } else {
398
- // seq_id is negative, then the range should include everything or nothing
399
- if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
400
- return false;
401
- }
402
- }
403
- }
404
-
405
- for (uint32_t i = 0; i < cache.size; ++i) {
406
- if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
407
- if (seq_id < 0) {
408
- cache.cells[i].seq_id.clear();
409
- } else if (cache.cells[i].has_seq_id(seq_id)) {
410
- cache.cells[i].seq_id.erase(seq_id);
411
- } else {
412
- continue;
413
- }
414
- if (cache.cells[i].is_empty()) {
415
- // keep count of the number of used cells
416
- if (cache.cells[i].pos >= 0) cache.used--;
417
-
418
- cache.cells[i].pos = -1;
419
- cache.cells[i].src = -1;
420
- if (new_head == cache.size) new_head = i;
421
- }
422
- }
423
- }
424
-
425
- // If we freed up a slot, set head to it so searching can start there.
426
- if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
427
-
428
- return true;
429
- }
430
-
431
- void llama_kv_cache_seq_cp(
432
- struct llama_kv_cache & cache,
433
- llama_seq_id seq_id_src,
434
- llama_seq_id seq_id_dst,
435
- llama_pos p0,
436
- llama_pos p1) {
437
- if (p0 < 0) p0 = 0;
438
- if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
439
-
440
- if (cache.recurrent) {
441
- if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
442
- llama_kv_cell & tail_src = cache.cells[seq_id_src];
443
- llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
444
- if (tail_dst.tail >= 0) {
445
- // clear destination seq_id if it wasn't empty
446
- llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
447
-
448
- cell_dst.seq_id.erase(seq_id_dst);
449
- tail_dst.tail = -1;
450
- if (cell_dst.seq_id.empty()) {
451
- cell_dst.pos = -1;
452
- cell_dst.delta = -1;
453
- cell_dst.src = -1;
454
- cache.used -= 1;
455
- }
456
- }
457
- if (tail_src.tail >= 0) {
458
- llama_kv_cell & cell_src = cache.cells[tail_src.tail];
459
-
460
- cell_src.seq_id.insert(seq_id_dst);
461
- tail_dst.tail = tail_src.tail;
462
- }
463
- }
464
-
465
- return;
466
- }
467
- // otherwise, this is the KV cache of a Transformer-like model
468
-
469
- cache.head = 0;
470
-
471
- for (uint32_t i = 0; i < cache.size; ++i) {
472
- if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
473
- cache.cells[i].seq_id.insert(seq_id_dst);
474
- }
475
- }
476
- }
477
-
478
- void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
479
- uint32_t new_head = cache.size;
480
-
481
- for (uint32_t i = 0; i < cache.size; ++i) {
482
- if (cache.recurrent && (llama_seq_id) i != seq_id) {
483
- cache.cells[i].tail = -1;
484
- }
485
- if (!cache.cells[i].has_seq_id(seq_id)) {
486
- if (cache.cells[i].pos >= 0) cache.used--;
487
- cache.cells[i].pos = -1;
488
- cache.cells[i].src = -1;
489
- cache.cells[i].seq_id.clear();
490
- if (new_head == cache.size) new_head = i;
491
- } else {
492
- cache.cells[i].seq_id.clear();
493
- cache.cells[i].seq_id.insert(seq_id);
494
- }
495
- }
496
-
497
- // If we freed up a slot, set head to it so searching can start there.
498
- if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
499
- }
500
-
501
- void llama_kv_cache_seq_add(
502
- struct llama_kv_cache & cache,
503
- llama_seq_id seq_id,
504
- llama_pos p0,
505
- llama_pos p1,
506
- llama_pos delta) {
507
- uint32_t new_head = cache.size;
508
-
509
- if (p0 < 0) p0 = 0;
510
- if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
511
- // If there is no range then return early to avoid looping over the cache.
512
- if (p0 == p1) return;
513
-
514
- if (cache.recurrent) {
515
- // for Mamba-like or RWKV models, only the pos needs to be shifted
516
- if (0 <= seq_id && seq_id < (int64_t) cache.size) {
517
- const int32_t tail_id = cache.cells[seq_id].tail;
518
- if (tail_id >= 0) {
519
- llama_kv_cell & cell = cache.cells[tail_id];
520
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
521
- cell.pos += delta;
522
- }
523
- }
524
- }
525
- return;
526
- }
527
-
528
- for (uint32_t i = 0; i < cache.size; ++i) {
529
- if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
530
- cache.has_shift = true;
531
- cache.cells[i].pos += delta;
532
- cache.cells[i].delta += delta;
533
-
534
- if (cache.cells[i].pos < 0) {
535
- if (!cache.cells[i].is_empty()) {
536
- cache.used--;
537
- }
538
- cache.cells[i].pos = -1;
539
- cache.cells[i].seq_id.clear();
540
- if (new_head == cache.size) {
541
- new_head = i;
542
- }
543
- }
544
- }
545
- }
546
-
547
- // If we freed up a slot, set head to it so searching can start there.
548
- // Otherwise we just start the next search from the beginning.
549
- cache.head = new_head != cache.size ? new_head : 0;
550
- }
551
-
552
- void llama_kv_cache_seq_div(
553
- struct llama_kv_cache & cache,
554
- llama_seq_id seq_id,
555
- llama_pos p0,
556
- llama_pos p1,
557
- int d) {
558
- if (p0 < 0) p0 = 0;
559
- if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
560
- // If there is no range then return early to avoid looping over the cache.
561
- if (p0 == p1) return;
562
-
563
- if (cache.recurrent) {
564
- // for Mamba-like or RWKV models, only the pos needs to be changed
565
- if (0 <= seq_id && seq_id < (int64_t) cache.size) {
566
- const int32_t tail_id = cache.cells[seq_id].tail;
567
- if (tail_id >= 0) {
568
- llama_kv_cell & cell = cache.cells[tail_id];
569
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
570
- cell.pos /= d;
571
- }
572
- }
573
- }
574
- return;
575
- }
576
-
577
- for (uint32_t i = 0; i < cache.size; ++i) {
578
- if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
579
- cache.has_shift = true;
580
-
581
- {
582
- llama_pos p_old = cache.cells[i].pos;
583
- cache.cells[i].pos /= d;
584
- cache.cells[i].delta += cache.cells[i].pos - p_old;
585
- }
586
- }
587
- }
588
- }
589
-
590
- llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
591
- llama_pos result = 0;
592
-
593
- for (uint32_t i = 0; i < cache.size; ++i) {
594
- if (cache.cells[i].has_seq_id(seq_id)) {
595
- result = std::max(result, cache.cells[i].pos);
596
- }
597
- }
598
-
599
- return result;
600
- }
601
-
602
- void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
603
- if (!cache.recurrent) {
604
- cache.do_defrag = true;
605
- }
606
- }
607
-
608
- int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) {
609
- int result = 0;
610
-
611
- for (uint32_t i = 0; i < kv.size; i++) {
612
- result += kv.cells[i].seq_id.size();
613
- }
614
-
615
- return result;
616
- }
617
-
618
- int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) {
619
- return kv.used;
620
- }
621
-
622
- bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) {
623
- return kv.can_shift;
624
- }
625
-
626
- //
627
- // kv cache view
628
- //
629
-
630
- struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) {
631
- struct llama_kv_cache_view result = {
632
- /*.n_cells = */ 0,
633
- /*.n_seq_max = */ n_seq_max,
634
- /*.token_count = */ 0,
635
- /*.used_cells = */ llama_get_kv_cache_used_cells(kv),
636
- /*.max_contiguous = */ 0,
637
- /*.max_contiguous_idx = */ -1,
638
- /*.cells = */ nullptr,
639
- /*.cells_sequences = */ nullptr,
640
- };
641
-
642
- return result;
643
- }
644
-
645
- void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
646
- if (view->cells != nullptr) {
647
- free(view->cells);
648
- view->cells = nullptr;
649
- }
650
- if (view->cells_sequences != nullptr) {
651
- free(view->cells_sequences);
652
- view->cells_sequences = nullptr;
653
- }
654
- }
655
-
656
- void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv) {
657
- if (uint32_t(view->n_cells) < kv.size || view->cells == nullptr) {
658
- view->n_cells = int32_t(kv.size);
659
- void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
660
- LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
661
- view->cells = (struct llama_kv_cache_view_cell *)p;
662
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
663
- LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
664
- view->cells_sequences = (llama_seq_id *)p;
665
- }
666
-
667
- const std::vector<llama_kv_cell> & kv_cells = kv.cells;
668
- llama_kv_cache_view_cell * c_curr = view->cells;
669
- llama_seq_id * cs_curr = view->cells_sequences;
670
- int32_t used_cells = 0;
671
- int32_t token_count = 0;
672
- int32_t curr_contig_idx = -1;
673
- uint32_t max_contig = 0;
674
- int32_t max_contig_idx = -1;
675
-
676
- for (int32_t i = 0; i < int32_t(kv.size); i++, c_curr++, cs_curr += view->n_seq_max) {
677
- const size_t curr_size = kv_cells[i].seq_id.size();
678
- token_count += curr_size;
679
- c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
680
-
681
- if (curr_size > 0) {
682
- if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
683
- max_contig = i - curr_contig_idx;
684
- max_contig_idx = curr_contig_idx;
685
- }
686
- curr_contig_idx = -1;
687
- } else if (curr_contig_idx < 0) {
688
- curr_contig_idx = i;
689
- }
690
-
691
- int seq_idx = 0;
692
- for (const llama_seq_id it : kv_cells[i].seq_id) {
693
- if (seq_idx >= view->n_seq_max) {
694
- break;
695
- }
696
- cs_curr[seq_idx] = it;
697
- seq_idx++;
698
- }
699
- if (seq_idx != 0) {
700
- used_cells++;
701
- }
702
- for (; seq_idx < view->n_seq_max; seq_idx++) {
703
- cs_curr[seq_idx] = -1;
704
- }
705
- }
706
- if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
707
- max_contig_idx = curr_contig_idx;
708
- max_contig = kv_cells.size() - curr_contig_idx;
709
- }
710
- view->max_contiguous = max_contig;
711
- view->max_contiguous_idx = max_contig_idx;
712
- view->token_count = token_count;
713
- view->used_cells = used_cells;
714
- if (uint32_t(used_cells) != kv.used) {
715
- LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
716
- __func__, kv.used, used_cells);
717
- }
718
- }
1
+ #include "llama-kv-cache.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+
8
+ #include <algorithm>
9
+ #include <cassert>
10
+ #include <limits>
11
+ #include <map>
12
+ #include <stdexcept>
13
+
14
+ llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
15
+ }
16
+
17
+ bool llama_kv_cache_unified::init(
18
+ const llama_model & model,
19
+ const llama_cparams & cparams,
20
+ lm_ggml_type type_k,
21
+ lm_ggml_type type_v,
22
+ uint32_t kv_size,
23
+ bool offload) {
24
+ const int32_t n_layer = hparams.n_layer;
25
+
26
+ has_shift = false;
27
+
28
+ recurrent = llama_model_is_recurrent(&model);
29
+ v_trans = !recurrent && !cparams.flash_attn;
30
+ can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
31
+
32
+ LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
33
+ __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer, can_shift);
34
+
35
+ head = 0;
36
+ size = kv_size;
37
+ used = 0;
38
+
39
+ this->type_k = type_k;
40
+ this->type_v = type_v;
41
+
42
+ cells.clear();
43
+ cells.resize(kv_size);
44
+
45
+ // create a context for each buffer type
46
+ std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
47
+ auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
48
+ auto it = ctx_map.find(buft);
49
+ if (it == ctx_map.end()) {
50
+ lm_ggml_init_params params = {
51
+ /*.mem_size =*/ size_t(2u*n_layer*lm_ggml_tensor_overhead()),
52
+ /*.mem_buffer =*/ NULL,
53
+ /*.no_alloc =*/ true,
54
+ };
55
+
56
+ lm_ggml_context * ctx = lm_ggml_init(params);
57
+ if (!ctx) {
58
+ return nullptr;
59
+ }
60
+
61
+ ctx_map[buft] = ctx;
62
+ ctxs.emplace_back(ctx);
63
+
64
+ return ctx;
65
+ }
66
+
67
+ return it->second;
68
+ };
69
+
70
+ k_l.reserve(n_layer);
71
+ v_l.reserve(n_layer);
72
+
73
+ for (int i = 0; i < n_layer; i++) {
74
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
75
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
76
+
77
+ const char * dev_name = "CPU";
78
+
79
+ lm_ggml_backend_buffer_type_t buft;
80
+ if (offload) {
81
+ auto * dev = model.dev_layer(i);
82
+ buft = lm_ggml_backend_dev_buffer_type(dev);
83
+
84
+ dev_name = lm_ggml_backend_dev_name(dev);
85
+ } else {
86
+ buft = lm_ggml_backend_cpu_buffer_type();
87
+ }
88
+
89
+ LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
90
+ i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
91
+
92
+ lm_ggml_context * ctx = ctx_for_buft(buft);
93
+ if (!ctx) {
94
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
95
+ return false;
96
+ }
97
+
98
+ lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
99
+ lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
100
+ lm_ggml_format_name(k, "cache_k_l%d", i);
101
+ lm_ggml_format_name(v, "cache_v_l%d", i);
102
+ k_l.push_back(k);
103
+ v_l.push_back(v);
104
+ }
105
+
106
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
107
+ for (auto it : ctx_map) {
108
+ auto * buft = it.first;
109
+ auto * ctx = it.second;
110
+
111
+ lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
112
+ if (!buf) {
113
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
114
+ return false;
115
+ }
116
+ lm_ggml_backend_buffer_clear(buf, 0);
117
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
118
+ bufs.emplace_back(buf);
119
+ }
120
+
121
+ return true;
122
+ }
123
+
124
+ int32_t llama_kv_cache_unified::get_n_tokens() const {
125
+ int32_t result = 0;
126
+
127
+ for (uint32_t i = 0; i < size; i++) {
128
+ result += cells[i].seq_id.size();
129
+ }
130
+
131
+ return result;
132
+ }
133
+
134
+ int32_t llama_kv_cache_unified::get_used_cells() const {
135
+ return used;
136
+ }
137
+
138
+ size_t llama_kv_cache_unified::total_size() const {
139
+ size_t size = 0;
140
+ for (const auto & buf : bufs) {
141
+ size += lm_ggml_backend_buffer_get_size(buf.get());
142
+ }
143
+
144
+ return size;
145
+ }
146
+
147
+ llama_pos llama_kv_cache_unified::pos_max() const {
148
+ llama_pos pos_max = -1;
149
+ for (const auto & cell : cells) {
150
+ pos_max = std::max(pos_max, cell.pos);
151
+ }
152
+
153
+ return pos_max;
154
+ }
155
+
156
+ void llama_kv_cache_unified::clear() {
157
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
158
+ cells[i].pos = -1;
159
+ cells[i].seq_id.clear();
160
+ cells[i].src = -1;
161
+ cells[i].tail = -1;
162
+ }
163
+ head = 0;
164
+ used = 0;
165
+
166
+ for (auto & buf : bufs) {
167
+ lm_ggml_backend_buffer_clear(buf.get(), 0);
168
+ }
169
+ }
170
+
171
+ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
172
+ uint32_t new_head = size;
173
+
174
+ if (p0 < 0) {
175
+ p0 = 0;
176
+ }
177
+
178
+ if (p1 < 0) {
179
+ p1 = std::numeric_limits<llama_pos>::max();
180
+ }
181
+
182
+ // models like Mamba or RWKV can't have a state partially erased
183
+ if (recurrent) {
184
+ if (seq_id >= (int64_t) size) {
185
+ // could be fatal
186
+ return false;
187
+ }
188
+ if (0 <= seq_id) {
189
+ int32_t & tail_id = cells[seq_id].tail;
190
+ if (tail_id >= 0) {
191
+ const llama_kv_cell & cell = cells[tail_id];
192
+ // partial intersection is invalid
193
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
194
+ return false;
195
+ }
196
+ // invalidate tails which will be cleared
197
+ if (p0 <= cell.pos && cell.pos < p1) {
198
+ tail_id = -1;
199
+ }
200
+ }
201
+ } else {
202
+ // seq_id is negative, then the range should include everything or nothing
203
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
204
+ return false;
205
+ }
206
+ }
207
+
208
+ return true;
209
+ }
210
+
211
+ for (uint32_t i = 0; i < size; ++i) {
212
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
213
+ if (seq_id < 0) {
214
+ cells[i].seq_id.clear();
215
+ } else if (cells[i].has_seq_id(seq_id)) {
216
+ cells[i].seq_id.erase(seq_id);
217
+ } else {
218
+ continue;
219
+ }
220
+ if (cells[i].is_empty()) {
221
+ // keep count of the number of used cells
222
+ if (cells[i].pos >= 0) {
223
+ used--;
224
+ }
225
+
226
+ cells[i].pos = -1;
227
+ cells[i].src = -1;
228
+
229
+ if (new_head == size) {
230
+ new_head = i;
231
+ }
232
+ }
233
+ }
234
+ }
235
+
236
+ // If we freed up a slot, set head to it so searching can start there.
237
+ if (new_head != size && new_head < head) {
238
+ head = new_head;
239
+ }
240
+
241
+ return true;
242
+ }
243
+
244
+ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
245
+ if (seq_id_src == seq_id_dst) {
246
+ return;
247
+ }
248
+
249
+ if (p0 < 0) {
250
+ p0 = 0;
251
+ }
252
+
253
+ if (p1 < 0) {
254
+ p1 = std::numeric_limits<llama_pos>::max();
255
+ }
256
+
257
+ if (recurrent) {
258
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
259
+ llama_kv_cell & tail_src = cells[seq_id_src];
260
+ llama_kv_cell & tail_dst = cells[seq_id_dst];
261
+ if (tail_dst.tail >= 0) {
262
+ // clear destination seq_id if it wasn't empty
263
+ llama_kv_cell & cell_dst = cells[tail_dst.tail];
264
+
265
+ cell_dst.seq_id.erase(seq_id_dst);
266
+ tail_dst.tail = -1;
267
+ if (cell_dst.seq_id.empty()) {
268
+ cell_dst.pos = -1;
269
+ cell_dst.delta = -1;
270
+ cell_dst.src = -1;
271
+ used -= 1;
272
+ }
273
+ }
274
+ if (tail_src.tail >= 0) {
275
+ llama_kv_cell & cell_src = cells[tail_src.tail];
276
+
277
+ cell_src.seq_id.insert(seq_id_dst);
278
+ tail_dst.tail = tail_src.tail;
279
+ }
280
+ }
281
+
282
+ return;
283
+ }
284
+
285
+ // otherwise, this is the KV of a Transformer-like model
286
+ head = 0;
287
+
288
+ for (uint32_t i = 0; i < size; ++i) {
289
+ if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
290
+ cells[i].seq_id.insert(seq_id_dst);
291
+ }
292
+ }
293
+ }
294
+
295
+ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
296
+ uint32_t new_head = size;
297
+
298
+ for (uint32_t i = 0; i < size; ++i) {
299
+ if (recurrent && (llama_seq_id) i != seq_id) {
300
+ cells[i].tail = -1;
301
+ }
302
+
303
+ if (!cells[i].has_seq_id(seq_id)) {
304
+ if (cells[i].pos >= 0) {
305
+ used--;
306
+ }
307
+
308
+ cells[i].pos = -1;
309
+ cells[i].src = -1;
310
+ cells[i].seq_id.clear();
311
+
312
+ if (new_head == size){
313
+ new_head = i;
314
+ }
315
+ } else {
316
+ cells[i].seq_id.clear();
317
+ cells[i].seq_id.insert(seq_id);
318
+ }
319
+ }
320
+
321
+ // If we freed up a slot, set head to it so searching can start there.
322
+ if (new_head != size && new_head < head) {
323
+ head = new_head;
324
+ }
325
+ }
326
+
327
+ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
328
+ if (delta == 0) {
329
+ return;
330
+ }
331
+
332
+ uint32_t new_head = size;
333
+
334
+ if (p0 < 0) {
335
+ p0 = 0;
336
+ }
337
+
338
+ if (p1 < 0) {
339
+ p1 = std::numeric_limits<llama_pos>::max();
340
+ }
341
+
342
+ // If there is no range then return early to avoid looping over the cache.
343
+ if (p0 == p1) {
344
+ return;
345
+ }
346
+
347
+ if (recurrent) {
348
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
349
+ if (0 <= seq_id && seq_id < (int64_t) size) {
350
+ const int32_t tail_id = cells[seq_id].tail;
351
+ if (tail_id >= 0) {
352
+ llama_kv_cell & cell = cells[tail_id];
353
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
354
+ cell.pos += delta;
355
+ }
356
+ }
357
+ }
358
+ return;
359
+ }
360
+
361
+ for (uint32_t i = 0; i < size; ++i) {
362
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
363
+ has_shift = true;
364
+ cells[i].pos += delta;
365
+ cells[i].delta += delta;
366
+
367
+ if (cells[i].pos < 0) {
368
+ if (!cells[i].is_empty()) {
369
+ used--;
370
+ }
371
+ cells[i].pos = -1;
372
+ cells[i].seq_id.clear();
373
+ if (new_head == size) {
374
+ new_head = i;
375
+ }
376
+ }
377
+ }
378
+ }
379
+
380
+ // If we freed up a slot, set head to it so searching can start there.
381
+ // Otherwise we just start the next search from the beginning.
382
+ head = new_head != size ? new_head : 0;
383
+ }
384
+
385
+ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
386
+ if (d == 1) {
387
+ return;
388
+ }
389
+
390
+ if (p0 < 0) {
391
+ p0 = 0;
392
+ }
393
+
394
+ if (p1 < 0) {
395
+ p1 = std::numeric_limits<llama_pos>::max();
396
+ }
397
+
398
+ // If there is no range then return early to avoid looping over the cache.
399
+ if (p0 == p1) {
400
+ return;
401
+ }
402
+
403
+ if (recurrent) {
404
+ // for Mamba-like or RWKV models, only the pos needs to be changed
405
+ if (0 <= seq_id && seq_id < (int64_t) size) {
406
+ const int32_t tail_id = cells[seq_id].tail;
407
+ if (tail_id >= 0) {
408
+ llama_kv_cell & cell = cells[tail_id];
409
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
410
+ cell.pos /= d;
411
+ }
412
+ }
413
+ }
414
+
415
+ return;
416
+ }
417
+
418
+ for (uint32_t i = 0; i < size; ++i) {
419
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
420
+ has_shift = true;
421
+
422
+ {
423
+ llama_pos p_old = cells[i].pos;
424
+ cells[i].pos /= d;
425
+ cells[i].delta += cells[i].pos - p_old;
426
+ }
427
+ }
428
+ }
429
+ }
430
+
431
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
432
+ llama_pos result = 0;
433
+
434
+ for (uint32_t i = 0; i < size; ++i) {
435
+ if (cells[i].has_seq_id(seq_id)) {
436
+ result = std::max(result, cells[i].pos);
437
+ }
438
+ }
439
+
440
+ return result;
441
+ }
442
+
443
+ void llama_kv_cache_unified::defrag() {
444
+ if (!recurrent) {
445
+ do_defrag = true;
446
+ }
447
+ }
448
+
449
+ void llama_kv_cache_unified::restore() {
450
+ if (pending.ranges.empty()) {
451
+ return;
452
+ }
453
+
454
+ // TODO: tmp - move to llama_kv_cache_recurrent
455
+ if (recurrent) {
456
+ seq_rm(-1, -1, -1);
457
+ return;
458
+ }
459
+
460
+ uint32_t new_head = size;
461
+
462
+ for (auto & range : pending.ranges) {
463
+ for (uint32_t i = range.c0; i < range.c1; ++i) {
464
+ cells[i].seq_id.clear();
465
+
466
+ // keep count of the number of used cells
467
+ if (cells[i].pos >= 0) {
468
+ used--;
469
+ }
470
+
471
+ cells[i].pos = -1;
472
+ cells[i].src = -1;
473
+ }
474
+
475
+ new_head = std::min(new_head, range.c0);
476
+ }
477
+
478
+ if (new_head != size && new_head < head) {
479
+ head = new_head;
480
+ }
481
+ }
482
+
483
+ void llama_kv_cache_unified::commit() {
484
+ // TODO: tmp - move to llama_kv_cache_recurrent
485
+ if (recurrent) {
486
+ return;
487
+ }
488
+
489
+ if (pending.ranges.empty()) {
490
+ LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
491
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
492
+ return;
493
+ }
494
+
495
+ pending.ranges.clear();
496
+ }
497
+
498
+ bool llama_kv_cache_unified::get_can_shift() const {
499
+ return can_shift;
500
+ }
501
+
502
+ bool llama_kv_cache_unified::find_slot(
503
+ const llama_ubatch & ubatch) {
504
+ const uint32_t n_tokens = ubatch.n_tokens;
505
+ const uint32_t n_seqs = ubatch.n_seqs;
506
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
507
+
508
+ // if we have enough unused cells before the current head ->
509
+ // better to start searching from the beginning of the cache, hoping to fill it
510
+ if (head > used + 2*ubatch.n_tokens) {
511
+ head = 0;
512
+ }
513
+
514
+ if (recurrent) {
515
+ // For recurrent state architectures (like Mamba or RWKV),
516
+ // each cache cell can store the state for a whole sequence.
517
+ // A slot should be always be contiguous.
518
+
519
+ // can only process batches with an equal number of new tokens in each sequence
520
+ LM_GGML_ASSERT(ubatch.equal_seqs);
521
+
522
+ int32_t min = size - 1;
523
+ int32_t max = 0;
524
+
525
+ // everything should fit if all seq_ids are smaller than the max
526
+ for (uint32_t s = 0; s < n_seqs; ++s) {
527
+ const uint32_t n_seq_id = ubatch.n_seq_id[s];
528
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
529
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
530
+
531
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
532
+ // too big seq_id
533
+ // TODO: would it be possible to resize the cache instead?
534
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
535
+ return false;
536
+ }
537
+ if (j > 0) {
538
+ llama_kv_cell & seq = cells[seq_id];
539
+ if (seq.tail >= 0) {
540
+ llama_kv_cell & cell = cells[seq.tail];
541
+ // clear cells from seq_ids that become shared
542
+ // (should not normally happen, but let's handle it anyway)
543
+ cell.seq_id.erase(seq_id);
544
+ seq.tail = -1;
545
+ if (cell.seq_id.empty()) {
546
+ cell.pos = -1;
547
+ cell.src = -1;
548
+ used -= 1;
549
+ }
550
+ }
551
+ }
552
+ }
553
+ }
554
+
555
+ #ifndef NDEBUG
556
+ {
557
+ std::vector<int32_t> tails_verif;
558
+ tails_verif.assign(size, -1);
559
+ for (uint32_t i = 0; i < size; ++i) {
560
+ llama_kv_cell & cell = cells[i];
561
+ for (llama_seq_id seq_id : cell.seq_id) {
562
+ if (tails_verif[seq_id] != -1) {
563
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
564
+ }
565
+ tails_verif[seq_id] = i;
566
+ }
567
+ }
568
+ for (uint32_t i = 0; i < size; ++i) {
569
+ if (tails_verif[i] != cells[i].tail) {
570
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
571
+ }
572
+ }
573
+ }
574
+ #endif
575
+
576
+ // find next empty cell
577
+ uint32_t next_empty_cell = head;
578
+
579
+ for (uint32_t i = 0; i < size; ++i) {
580
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
581
+ llama_kv_cell & cell = cells[next_empty_cell];
582
+ if (cell.is_empty()) { break; }
583
+ next_empty_cell += 1;
584
+ }
585
+
586
+ // find usable cell range
587
+ for (uint32_t s = 0; s < n_seqs; ++s) {
588
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
589
+ llama_kv_cell & seq_meta = cells[seq_id];
590
+ bool has_cell = false;
591
+ if (seq_meta.tail >= 0) {
592
+ llama_kv_cell & cell = cells[seq_meta.tail];
593
+ LM_GGML_ASSERT(cell.has_seq_id(seq_id));
594
+ // does this seq_id "own" the cell?
595
+ if (cell.seq_id.size() == 1) { has_cell = true; }
596
+ }
597
+ if (!has_cell) {
598
+ llama_kv_cell & empty_cell = cells[next_empty_cell];
599
+ LM_GGML_ASSERT(empty_cell.is_empty());
600
+ // copy old tail into the empty cell
601
+ if (seq_meta.tail >= 0) {
602
+ llama_kv_cell & orig_cell = cells[seq_meta.tail];
603
+ empty_cell.pos = orig_cell.pos;
604
+ empty_cell.src = orig_cell.src;
605
+ orig_cell.seq_id.erase(seq_id);
606
+ empty_cell.seq_id.insert(seq_id); // will be overwritten
607
+ }
608
+ seq_meta.tail = next_empty_cell;
609
+ // find next empty cell
610
+ if (s + 1 < n_seqs) {
611
+ next_empty_cell += 1;
612
+ for (uint32_t i = 0; i < size; ++i) {
613
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
614
+ llama_kv_cell & cell = cells[next_empty_cell];
615
+ if (cell.is_empty()) { break; }
616
+ next_empty_cell += 1;
617
+ }
618
+ }
619
+ }
620
+ if (min > seq_meta.tail) { min = seq_meta.tail; }
621
+ if (max < seq_meta.tail) { max = seq_meta.tail; }
622
+ }
623
+
624
+ // gather and re-order
625
+ for (uint32_t s = 0; s < n_seqs; ++s) {
626
+ int32_t dst_id = s + min;
627
+ int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
628
+ if (dst_id != src_id) {
629
+ llama_kv_cell & dst_cell = cells[dst_id];
630
+ llama_kv_cell & src_cell = cells[src_id];
631
+
632
+ std::swap(dst_cell.pos, src_cell.pos);
633
+ std::swap(dst_cell.src, src_cell.src);
634
+ std::swap(dst_cell.seq_id, src_cell.seq_id);
635
+
636
+ // swap tails (assuming they NEVER overlap)
637
+ for (const llama_seq_id seq_id : src_cell.seq_id) {
638
+ cells[seq_id].tail = src_id;
639
+ }
640
+ for (const llama_seq_id seq_id : dst_cell.seq_id) {
641
+ cells[seq_id].tail = dst_id;
642
+ }
643
+ }
644
+ }
645
+
646
+ // update the pos of the used seqs
647
+ for (uint32_t s = 0; s < n_seqs; ++s) {
648
+ const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
649
+ int32_t cell_id = s + min;
650
+ llama_kv_cell & cell = cells[cell_id];
651
+
652
+ if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
653
+ // What should happen when the pos backtracks or skips a value?
654
+ // Clearing the state mid-batch would require special-casing which isn't done.
655
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
656
+ __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
657
+ }
658
+ cell.pos = last_pos;
659
+ cell.seq_id.clear();
660
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
661
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
662
+ cell.seq_id.insert(seq_id);
663
+ cells[seq_id].tail = cell_id;
664
+ }
665
+ }
666
+
667
+ // allow getting the range of used cells, from head to head + n
668
+ head = min;
669
+ n = max - min + 1;
670
+ used = std::count_if(cells.begin(), cells.end(),
671
+ [](const llama_kv_cell& cell){ return !cell.is_empty(); });
672
+
673
+ // sanity check
674
+ return n >= n_seqs;
675
+ }
676
+
677
+ // otherwise, one cell per token.
678
+
679
+ if (n_tokens > size) {
680
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
681
+ return false;
682
+ }
683
+
684
+ uint32_t n_tested = 0;
685
+
686
+ while (true) {
687
+ if (head + n_tokens > size) {
688
+ n_tested += size - head;
689
+ head = 0;
690
+ continue;
691
+ }
692
+
693
+ bool found = true;
694
+ for (uint32_t i = 0; i < n_tokens; i++) {
695
+ if (cells[head + i].pos >= 0) {
696
+ found = false;
697
+ head += i + 1;
698
+ n_tested += i + 1;
699
+ break;
700
+ }
701
+ }
702
+
703
+ if (found) {
704
+ break;
705
+ }
706
+
707
+ if (n_tested >= size) {
708
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
709
+ return false;
710
+ }
711
+ }
712
+
713
+ for (uint32_t s = 0; s < n_seqs; s++) {
714
+ for (uint32_t i = 0; i < n_seq_tokens; ++i) {
715
+ uint32_t k = s*n_seq_tokens + i;
716
+ cells[head + k].pos = ubatch.pos[k];
717
+
718
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
719
+ cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
720
+ }
721
+ }
722
+ }
723
+
724
+ used += n_tokens;
725
+
726
+ pending.ranges.push_back({head, head + n_tokens});
727
+
728
+ return true;
729
+ }
730
+
731
+ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
732
+ // the FA kernels require padding to avoid extra runtime boundary checks
733
+ return cparams.flash_attn ? 256u : 32u;
734
+ }
735
+
736
+ uint32_t llama_kv_cache_unified::cell_max() const {
737
+ for (uint32_t i = size; i > 0; --i) {
738
+ const llama_kv_cell & cell = cells[i - 1];
739
+
740
+ if (cell.pos >= 0 && !cell.is_empty()) {
741
+ return i;
742
+ }
743
+ }
744
+
745
+ return 0;
746
+ }
747
+
748
+ size_t llama_kv_cache_unified::size_k_bytes() const {
749
+ size_t size_k_bytes = 0;
750
+
751
+ for (const auto & k : k_l) {
752
+ size_k_bytes += lm_ggml_nbytes(k);
753
+ }
754
+
755
+ return size_k_bytes;
756
+ }
757
+
758
+ size_t llama_kv_cache_unified::size_v_bytes() const {
759
+ size_t size_v_bytes = 0;
760
+
761
+ for (const auto & v : v_l) {
762
+ size_v_bytes += lm_ggml_nbytes(v);
763
+ }
764
+
765
+ return size_v_bytes;
766
+ }
767
+
768
+ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
769
+ const uint32_t n_layer = hparams.n_layer;
770
+
771
+ const uint32_t n_kv = cell_max();
772
+ const uint32_t n_used = used;
773
+
774
+ assert(n_used <= n_kv);
775
+
776
+ //const int64_t t_start = lm_ggml_time_us();
777
+
778
+ // number of cells moved
779
+ uint32_t n_moves = 0;
780
+
781
+ // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
782
+ // - source view, destination view, copy operation
783
+ // - x2 for keys and values
784
+ //const uint32_t max_moves = max_nodes()/(6*n_layer);
785
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
786
+ const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
787
+
788
+ // determine which KV cells to move where
789
+ //
790
+ // cell i moves to ids[i]
791
+ //
792
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
793
+ //
794
+ auto & ids = defrag_info.ids;
795
+
796
+ ids.clear();
797
+ ids.resize(n_kv, n_kv);
798
+
799
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
800
+ const auto & cell0 = cells[i0];
801
+
802
+ if (!cell0.is_empty()) {
803
+ ids[i0] = i0;
804
+
805
+ continue;
806
+ }
807
+
808
+ // found a hole - fill it with data from the end of the cache
809
+
810
+ uint32_t nh = 1;
811
+
812
+ // determine the size of the hole
813
+ while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
814
+ nh++;
815
+ }
816
+
817
+ uint32_t nf = 0;
818
+ uint32_t is = n_kv - 1;
819
+
820
+ // starting from the end, find nh non-empty cells
821
+ for (; is > i0; --is) {
822
+ const auto & cell1 = cells[is];
823
+
824
+ if (cell1.is_empty() || ids[is] != n_kv) {
825
+ continue;
826
+ }
827
+
828
+ // non-empty cell which is not yet moved
829
+ nf++;
830
+
831
+ if (nf == nh) {
832
+ break;
833
+ }
834
+ }
835
+
836
+ // this can only happen if `n_used` is not accurate, which would be a bug
837
+ LM_GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
838
+
839
+ nf = 0;
840
+
841
+ uint32_t i1 = is;
842
+
843
+ // are we moving a contiguous block of memory?
844
+ bool cont = false;
845
+
846
+ // should we stop searching for the next move?
847
+ bool stop = false;
848
+
849
+ // go back and move the nf cells to the hole
850
+ for (; i1 < n_kv; ++i1) {
851
+ auto & cell1 = cells[i1];
852
+
853
+ if (cell1.is_empty() || ids[i1] != n_kv) {
854
+ if (n_moves == max_moves) {
855
+ stop = true;
856
+ break;
857
+ }
858
+
859
+ cont = false;
860
+ continue;
861
+ }
862
+
863
+ // this cell goes to (i0 + nf)
864
+ ids[i1] = i0 + nf;
865
+
866
+ // move the cell meta data
867
+ cells[i0 + nf] = cell1;
868
+
869
+ // clear the old cell and move the head there
870
+ cell1 = llama_kv_cell();
871
+ head = n_used;
872
+
873
+ if (!cont) {
874
+ n_moves++;
875
+ cont = true;
876
+ }
877
+
878
+ nf++;
879
+
880
+ if (nf == nh) {
881
+ break;
882
+ }
883
+ }
884
+
885
+ if (stop || n_moves == max_moves) {
886
+ break;
887
+ }
888
+
889
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
890
+
891
+ i0 += nh - 1;
892
+ }
893
+
894
+ if (n_moves == 0) {
895
+ return false;
896
+ }
897
+
898
+ LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
899
+
900
+ LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
901
+
902
+ return true;
903
+ }
904
+
905
+ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
906
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // cell ranges; .first is inclusive, .second is exclusive
907
+ uint32_t cell_count = 0;
908
+
909
+ // Count the number of cells with the specified seq_id
910
+ // Find all the ranges of cells with this seq id (or all, when -1)
911
+ uint32_t cell_range_begin = size;
912
+ for (uint32_t i = 0; i < size; ++i) {
913
+ const auto & cell = cells[i];
914
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
915
+ ++cell_count;
916
+ if (cell_range_begin == size) {
917
+ cell_range_begin = i;
918
+ }
919
+ } else {
920
+ if (cell_range_begin != size) {
921
+ cell_ranges.emplace_back(cell_range_begin, i);
922
+ cell_range_begin = size;
923
+ }
924
+ }
925
+ }
926
+ if (cell_range_begin != size) {
927
+ cell_ranges.emplace_back(cell_range_begin, size);
928
+ }
929
+
930
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
931
+ uint32_t cell_count_check = 0;
932
+ for (const auto & range : cell_ranges) {
933
+ cell_count_check += range.second - range.first;
934
+ }
935
+ LM_GGML_ASSERT(cell_count == cell_count_check);
936
+
937
+ io.write(&cell_count, sizeof(cell_count));
938
+
939
+ state_write_meta(io, cell_ranges, seq_id);
940
+ state_write_data(io, cell_ranges);
941
+ }
942
+
943
+ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
944
+ uint32_t cell_count;
945
+ io.read_to(&cell_count, sizeof(cell_count));
946
+
947
+ bool res = true;
948
+ res = res && state_read_meta(io, cell_count, seq_id);
949
+ res = res && state_read_data(io, cell_count);
950
+
951
+ if (!res) {
952
+ if (seq_id == -1) {
953
+ clear();
954
+ } else {
955
+ seq_rm(seq_id, -1, -1);
956
+ }
957
+ throw std::runtime_error("failed to restore kv cache");
958
+ }
959
+ }
960
+
961
+ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
962
+ for (const auto & range : cell_ranges) {
963
+ for (uint32_t i = range.first; i < range.second; ++i) {
964
+ const auto & cell = cells[i];
965
+ const llama_pos pos = cell.pos;
966
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
967
+
968
+ io.write(&pos, sizeof(pos));
969
+ io.write(&n_seq_id, sizeof(n_seq_id));
970
+
971
+ if (n_seq_id) {
972
+ for (auto seq_id : cell.seq_id) {
973
+ io.write(&seq_id, sizeof(seq_id));
974
+ }
975
+ }
976
+ }
977
+ }
978
+ }
979
+
980
+ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
981
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
982
+ const uint32_t n_layer = hparams.n_layer;
983
+
984
+ io.write(&v_trans, sizeof(v_trans));
985
+ io.write(&n_layer, sizeof(n_layer));
986
+
987
+ std::vector<uint8_t> tmp_buf; // currently unused; cell ranges are written directly with io.write_tensor
988
+
989
+ // Iterate and write all the keys first, each row is a cell
990
+ // Get whole range at a time
991
+ for (uint32_t il = 0; il < n_layer; ++il) {
992
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
993
+
994
+ // Write key type
995
+ const int32_t k_type_i = (int32_t)k_l[il]->type;
996
+ io.write(&k_type_i, sizeof(k_type_i));
997
+
998
+ // Write row size of key
999
+ const uint64_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1000
+ io.write(&k_size_row, sizeof(k_size_row));
1001
+
1002
+ // Write out each range of cells as one contiguous block (k_size_row bytes per cell)
1003
+ for (const auto & range : cell_ranges) {
1004
+ const size_t range_size = range.second - range.first;
1005
+ const size_t buf_size = range_size * k_size_row;
1006
+ io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
1007
+ }
1008
+ }
1009
+
1010
+ if (!v_trans) {
1011
+ for (uint32_t il = 0; il < n_layer; ++il) {
1012
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1013
+
1014
+ // Write value type
1015
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1016
+ io.write(&v_type_i, sizeof(v_type_i));
1017
+
1018
+ // Write row size of value
1019
+ const uint64_t v_size_row = lm_ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1020
+ io.write(&v_size_row, sizeof(v_size_row));
1021
+
1022
+ // Write out each range of cells as one contiguous block (v_size_row bytes per cell)
1023
+ for (const auto & range : cell_ranges) {
1024
+ const size_t range_size = range.second - range.first;
1025
+ const size_t buf_size = range_size * v_size_row;
1026
+ io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
1027
+ }
1028
+ }
1029
+ } else {
1030
+ // When v is transposed, we also need the element size and get the element ranges from each row
1031
+ const uint32_t kv_size = size;
1032
+ for (uint32_t il = 0; il < n_layer; ++il) {
1033
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1034
+
1035
+ // Write value type
1036
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1037
+ io.write(&v_type_i, sizeof(v_type_i));
1038
+
1039
+ // Write element size
1040
+ const uint32_t v_size_el = lm_ggml_type_size(v_l[il]->type);
1041
+ io.write(&v_size_el, sizeof(v_size_el));
1042
+
1043
+ // Write GQA embedding size
1044
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
1045
+
1046
+ // For each row, we get the element values of each cell
1047
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1048
+ // Write out each range of cells within this row (v_size_el bytes per cell)
1049
+ for (const auto & range : cell_ranges) {
1050
+ const size_t range_size = range.second - range.first;
1051
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1052
+ const size_t buf_size = range_size * v_size_el;
1053
+ io.write_tensor(v_l[il], src_offset, buf_size);
1054
+ }
1055
+ }
1056
+ }
1057
+ }
1058
+ }
1059
+
1060
+ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
1061
+ if (dest_seq_id != -1) {
1062
+ // single sequence
1063
+
1064
+ seq_rm(dest_seq_id, -1, -1);
1065
+
1066
+ llama_sbatch sbatch;
1067
+ llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
1068
+
1069
+ batch.n_tokens = cell_count;
1070
+ batch.n_seq_tokens = cell_count;
1071
+ batch.n_seqs = 1;
1072
+
1073
+ for (uint32_t i = 0; i < cell_count; ++i) {
1074
+ llama_pos pos;
1075
+ uint32_t n_seq_id;
1076
+
1077
+ io.read_to(&pos, sizeof(pos));
1078
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1079
+
1080
+ if (n_seq_id != 0) {
1081
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1082
+ return false;
1083
+ }
1084
+
1085
+ batch.pos[i] = pos;
1086
+ }
1087
+ batch.n_seq_id[0] = 1;
1088
+ batch.seq_id[0] = &dest_seq_id;
1089
+ if (!find_slot(batch)) {
1090
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1091
+ return false;
1092
+ }
1093
+ commit();
1094
+
1095
+ // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
1096
+ // Assume that this is one contiguous block of cells
1097
+ LM_GGML_ASSERT(head + cell_count <= size);
1098
+ LM_GGML_ASSERT(cells[head].pos == batch.pos[0]);
1099
+ LM_GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
1100
+ LM_GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
1101
+ LM_GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
1102
+ } else {
1103
+ // whole KV cache restore
1104
+
1105
+ if (cell_count > size) {
1106
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1107
+ return false;
1108
+ }
1109
+
1110
+ clear();
1111
+
1112
+ for (uint32_t i = 0; i < cell_count; ++i) {
1113
+ llama_kv_cell & cell = cells[i];
1114
+
1115
+ llama_pos pos;
1116
+ uint32_t n_seq_id;
1117
+
1118
+ io.read_to(&pos, sizeof(pos));
1119
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1120
+
1121
+ cell.pos = pos;
1122
+
1123
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1124
+ llama_seq_id seq_id;
1125
+ io.read_to(&seq_id, sizeof(seq_id));
1126
+
1127
+ // TODO: llama_kv_cache_unified should have a notion of max sequences
1128
+ //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
1129
+ if (seq_id < 0) {
1130
+ //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
1131
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
1132
+ return false;
1133
+ }
1134
+
1135
+ cell.seq_id.insert(seq_id);
1136
+
1137
+ if (recurrent) {
1138
+ int32_t & tail = cells[seq_id].tail;
1139
+ if (tail != -1) {
1140
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
1141
+ return false;
1142
+ }
1143
+ tail = i;
1144
+ }
1145
+ }
1146
+ }
1147
+
1148
+ head = 0;
1149
+ used = cell_count;
1150
+ }
1151
+
1152
+ if (recurrent) {
1153
+ for (uint32_t i = 0; i < cell_count; ++i) {
1154
+ uint32_t cell_id = head + i;
1155
+ // make sure the recurrent states will keep their restored state
1156
+ cells[cell_id].src = cell_id;
1157
+ }
1158
+ }
1159
+
1160
+ return true;
1161
+ }
1162
+
1163
+ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
1164
+ uint32_t v_trans;
1165
+ uint32_t n_layer;
1166
+ io.read_to(&v_trans, sizeof(v_trans));
1167
+ io.read_to(&n_layer, sizeof(n_layer));
1168
+
1169
+ if (n_layer != hparams.n_layer) {
1170
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
1171
+ return false;
1172
+ }
1173
+ if (cell_count > size) {
1174
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
1175
+ return false;
1176
+ }
1177
+ if (this->v_trans != (bool) v_trans) {
1178
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
1179
+ return false;
1180
+ }
1181
+
1182
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1183
+ for (uint32_t il = 0; il < n_layer; ++il) {
1184
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1185
+
1186
+ // Read type of key
1187
+ int32_t k_type_i_ref;
1188
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1189
+ const int32_t k_type_i = (int32_t) k_l[il]->type;
1190
+ if (k_type_i != k_type_i_ref) {
1191
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1192
+ return false;
1193
+ }
1194
+
1195
+ // Read row size of key
1196
+ uint64_t k_size_row_ref;
1197
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1198
+ const size_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1199
+ if (k_size_row != k_size_row_ref) {
1200
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1201
+ return false;
1202
+ }
1203
+
1204
+ if (cell_count) {
1205
+ // Read and set the keys for the whole cell range
1206
+ lm_ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1207
+ }
1208
+ }
1209
+
1210
+ if (!v_trans) {
1211
+ for (uint32_t il = 0; il < n_layer; ++il) {
1212
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1213
+
1214
+ // Read type of value
1215
+ int32_t v_type_i_ref;
1216
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1217
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1218
+ if (v_type_i != v_type_i_ref) {
1219
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1220
+ return false;
1221
+ }
1222
+
1223
+ // Read row size of value
1224
+ uint64_t v_size_row_ref;
1225
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1226
+ const size_t v_size_row = lm_ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1227
+ if (v_size_row != v_size_row_ref) {
1228
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1229
+ return false;
1230
+ }
1231
+
1232
+ if (cell_count) {
1233
+ // Read and set the values for the whole cell range
1234
+ lm_ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1235
+ }
1236
+ }
1237
+ } else {
1238
+ // For each layer, read the values for each cell (transposed)
1239
+ for (uint32_t il = 0; il < n_layer; ++il) {
1240
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1241
+
1242
+ // Read type of value
1243
+ int32_t v_type_i_ref;
1244
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1245
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1246
+ if (v_type_i != v_type_i_ref) {
1247
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1248
+ return false;
1249
+ }
1250
+
1251
+ // Read element size of value
1252
+ uint32_t v_size_el_ref;
1253
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1254
+ const size_t v_size_el = lm_ggml_type_size(v_l[il]->type);
1255
+ if (v_size_el != v_size_el_ref) {
1256
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1257
+ return false;
1258
+ }
1259
+
1260
+ // Read GQA embedding size
1261
+ uint32_t n_embd_v_gqa_ref;
1262
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1263
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1264
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1265
+ return false;
1266
+ }
1267
+
1268
+ if (cell_count) {
1269
+ // For each row in the transposed matrix, read the values for the whole cell range
1270
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1271
+ const size_t dst_offset = (head + j * size) * v_size_el;
1272
+ lm_ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1273
+ }
1274
+ }
1275
+ }
1276
+ }
1277
+
1278
+ return true;
1279
+ }
1280
+
1281
+ //
1282
+ // kv cache view
1283
+ //
1284
+
1285
+ llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
1286
+ llama_kv_cache_view result = {
1287
+ /*.n_cells = */ 0,
1288
+ /*.n_seq_max = */ n_seq_max,
1289
+ /*.token_count = */ 0,
1290
+ /*.used_cells = */ kv.get_used_cells(),
1291
+ /*.max_contiguous = */ 0,
1292
+ /*.max_contiguous_idx = */ -1,
1293
+ /*.cells = */ nullptr,
1294
+ /*.cells_sequences = */ nullptr,
1295
+ };
1296
+
1297
+ return result;
1298
+ }
1299
+
1300
+ void llama_kv_cache_view_free(llama_kv_cache_view * view) {
1301
+ if (view->cells != nullptr) {
1302
+ free(view->cells);
1303
+ view->cells = nullptr;
1304
+ }
1305
+ if (view->cells_sequences != nullptr) {
1306
+ free(view->cells_sequences);
1307
+ view->cells_sequences = nullptr;
1308
+ }
1309
+ }
1310
+
1311
+ void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
1312
+ // TODO: rework this in the future, for now quick hack
1313
+ const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
1314
+ if (kvu == nullptr) {
1315
+ LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
1316
+ return;
1317
+ }
1318
+
1319
+ if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
1320
+ view->n_cells = int32_t(kvu->size);
1321
+ void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
1322
+ LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
1323
+ view->cells = (llama_kv_cache_view_cell *)p;
1324
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
1325
+ LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
1326
+ view->cells_sequences = (llama_seq_id *)p;
1327
+ }
1328
+
1329
+ const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
1330
+ llama_kv_cache_view_cell * c_curr = view->cells;
1331
+ llama_seq_id * cs_curr = view->cells_sequences;
1332
+ int32_t used_cells = 0;
1333
+ int32_t token_count = 0;
1334
+ int32_t curr_contig_idx = -1;
1335
+ uint32_t max_contig = 0;
1336
+ int32_t max_contig_idx = -1;
1337
+
1338
+ for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
1339
+ const size_t curr_size = kv_cells[i].seq_id.size();
1340
+ token_count += curr_size;
1341
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
1342
+
1343
+ if (curr_size > 0) {
1344
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
1345
+ max_contig = i - curr_contig_idx;
1346
+ max_contig_idx = curr_contig_idx;
1347
+ }
1348
+ curr_contig_idx = -1;
1349
+ } else if (curr_contig_idx < 0) {
1350
+ curr_contig_idx = i;
1351
+ }
1352
+
1353
+ int seq_idx = 0;
1354
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
1355
+ if (seq_idx >= view->n_seq_max) {
1356
+ break;
1357
+ }
1358
+ cs_curr[seq_idx] = it;
1359
+ seq_idx++;
1360
+ }
1361
+ if (seq_idx != 0) {
1362
+ used_cells++;
1363
+ }
1364
+ for (; seq_idx < view->n_seq_max; seq_idx++) {
1365
+ cs_curr[seq_idx] = -1;
1366
+ }
1367
+ }
1368
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
1369
+ max_contig_idx = curr_contig_idx;
1370
+ max_contig = kv_cells.size() - curr_contig_idx;
1371
+ }
1372
+ view->max_contiguous = max_contig;
1373
+ view->max_contiguous_idx = max_contig_idx;
1374
+ view->token_count = token_count;
1375
+ view->used_cells = used_cells;
1376
+ if (uint32_t(used_cells) != kvu->used) {
1377
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
1378
+ __func__, kvu->used, used_cells);
1379
+ }
1380
+ }