cui-llama.rn 1.4.4 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/android/src/main/CMakeLists.txt +2 -2
  2. package/android/src/main/jni.cpp +12 -10
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/chat-template.hpp +529 -529
  12. package/cpp/chat.cpp +959 -265
  13. package/cpp/chat.h +135 -0
  14. package/cpp/common.cpp +2064 -1996
  15. package/cpp/common.h +700 -744
  16. package/cpp/ggml-alloc.c +1039 -1030
  17. package/cpp/ggml-alloc.h +1 -1
  18. package/cpp/ggml-backend-impl.h +255 -255
  19. package/cpp/ggml-backend-reg.cpp +586 -582
  20. package/cpp/ggml-backend.cpp +2004 -2002
  21. package/cpp/ggml-backend.h +354 -354
  22. package/cpp/ggml-common.h +1851 -1851
  23. package/cpp/ggml-cpp.h +39 -39
  24. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  25. package/cpp/ggml-cpu-aarch64.h +8 -8
  26. package/cpp/ggml-cpu-impl.h +531 -380
  27. package/cpp/ggml-cpu-quants.c +12527 -11517
  28. package/cpp/ggml-cpu-traits.cpp +36 -36
  29. package/cpp/ggml-cpu-traits.h +38 -38
  30. package/cpp/ggml-cpu.c +15766 -14485
  31. package/cpp/ggml-cpu.cpp +655 -633
  32. package/cpp/ggml-cpu.h +138 -135
  33. package/cpp/ggml-impl.h +567 -567
  34. package/cpp/ggml-metal-impl.h +235 -0
  35. package/cpp/ggml-metal.h +66 -66
  36. package/cpp/ggml-metal.m +5146 -5002
  37. package/cpp/ggml-opt.cpp +854 -854
  38. package/cpp/ggml-opt.h +216 -216
  39. package/cpp/ggml-quants.c +5238 -5238
  40. package/cpp/ggml-threading.h +14 -14
  41. package/cpp/ggml.c +6529 -6524
  42. package/cpp/ggml.h +2198 -2194
  43. package/cpp/gguf.cpp +1329 -1329
  44. package/cpp/gguf.h +202 -202
  45. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  46. package/cpp/json-schema-to-grammar.h +21 -22
  47. package/cpp/json.hpp +24766 -24766
  48. package/cpp/llama-adapter.cpp +347 -347
  49. package/cpp/llama-adapter.h +74 -74
  50. package/cpp/llama-arch.cpp +1513 -1492
  51. package/cpp/llama-arch.h +403 -402
  52. package/cpp/llama-batch.cpp +368 -368
  53. package/cpp/llama-batch.h +88 -88
  54. package/cpp/llama-chat.cpp +588 -587
  55. package/cpp/llama-chat.h +53 -53
  56. package/cpp/llama-context.cpp +1775 -1775
  57. package/cpp/llama-context.h +128 -128
  58. package/cpp/llama-cparams.cpp +1 -1
  59. package/cpp/llama-cparams.h +37 -37
  60. package/cpp/llama-cpp.h +30 -30
  61. package/cpp/llama-grammar.cpp +1219 -1219
  62. package/cpp/llama-grammar.h +173 -164
  63. package/cpp/llama-hparams.cpp +71 -71
  64. package/cpp/llama-hparams.h +139 -139
  65. package/cpp/llama-impl.cpp +167 -167
  66. package/cpp/llama-impl.h +61 -61
  67. package/cpp/llama-kv-cache.cpp +718 -718
  68. package/cpp/llama-kv-cache.h +219 -218
  69. package/cpp/llama-mmap.cpp +600 -590
  70. package/cpp/llama-mmap.h +68 -68
  71. package/cpp/llama-model-loader.cpp +1124 -1124
  72. package/cpp/llama-model-loader.h +167 -167
  73. package/cpp/llama-model.cpp +4087 -4023
  74. package/cpp/llama-model.h +370 -370
  75. package/cpp/llama-sampling.cpp +2558 -2525
  76. package/cpp/llama-sampling.h +32 -32
  77. package/cpp/llama-vocab.cpp +3264 -3252
  78. package/cpp/llama-vocab.h +125 -125
  79. package/cpp/llama.cpp +10284 -10137
  80. package/cpp/llama.h +1354 -1340
  81. package/cpp/log.cpp +393 -423
  82. package/cpp/log.h +132 -132
  83. package/cpp/minja/chat-template.hpp +529 -0
  84. package/cpp/minja/minja.hpp +2915 -0
  85. package/cpp/minja.hpp +2915 -2883
  86. package/cpp/rn-llama.cpp +20 -37
  87. package/cpp/rn-llama.h +12 -2
  88. package/cpp/sampling.cpp +570 -532
  89. package/cpp/sgemm.cpp +2598 -2598
  90. package/cpp/sgemm.h +14 -14
  91. package/cpp/speculative.cpp +278 -277
  92. package/cpp/speculative.h +28 -28
  93. package/package.json +1 -1
  94. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  95. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  96. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  97. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  98. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  99. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  100. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  101. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  102. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  103. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  104. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  105. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  106. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  107. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  108. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  109. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  110. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  111. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  112. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  113. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  114. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  115. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  116. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  117. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  118. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  119. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  120. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  122. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  124. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  125. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  126. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  127. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  128. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  129. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  130. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  132. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  134. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  135. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  136. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  194. package/android/src/main/build-arm64/Makefile +0 -1862
  195. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  196. package/cpp/chat.hpp +0 -55
  197. package/cpp/rn-llama.hpp +0 -913
@@ -1,218 +1,219 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
-
5
- #include "ggml-cpp.h"
6
-
7
- #include <set>
8
- #include <vector>
9
-
10
- struct llama_kv_cell {
11
- llama_pos pos = -1;
12
- llama_pos delta = 0;
13
- int32_t src = -1; // used by recurrent state models to copy states
14
- int32_t tail = -1;
15
-
16
- std::set<llama_seq_id> seq_id;
17
-
18
- bool has_seq_id(const llama_seq_id & id) const {
19
- return seq_id.find(id) != seq_id.end();
20
- }
21
-
22
- bool is_empty() const {
23
- return seq_id.empty();
24
- }
25
-
26
- bool is_same_seq(const llama_kv_cell & other) const {
27
- return seq_id == other.seq_id;
28
- }
29
- };
30
-
31
- // ring-buffer of cached KV data
32
- struct llama_kv_cache {
33
- bool has_shift = false;
34
- bool do_defrag = false;
35
- bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
36
- bool v_trans = true; // the value tensor is transposed
37
- bool can_shift = false;
38
-
39
- // Note: The value of head isn't only used to optimize searching
40
- // for a free KV slot. llama_decode_impl also uses it, so it
41
- // cannot be freely changed after a slot has been allocated.
42
- uint32_t head = 0;
43
- uint32_t size = 0;
44
- uint32_t used = 0; // used cells (i.e. at least one seq_id)
45
-
46
- // computed before each graph build
47
- uint32_t n = 0;
48
-
49
- lm_ggml_type type_k = LM_GGML_TYPE_F16;
50
- lm_ggml_type type_v = LM_GGML_TYPE_F16;
51
-
52
- std::vector<llama_kv_cell> cells;
53
-
54
- std::vector<struct lm_ggml_tensor *> k_l; // per layer
55
- std::vector<struct lm_ggml_tensor *> v_l;
56
-
57
- std::vector<lm_ggml_context_ptr> ctxs;
58
- std::vector<lm_ggml_backend_buffer_ptr> bufs;
59
-
60
- size_t total_size() const {
61
- size_t size = 0;
62
- for (const auto & buf : bufs) {
63
- size += lm_ggml_backend_buffer_get_size(buf.get());
64
- }
65
-
66
- return size;
67
- }
68
-
69
- // TODO: better data structures to reduce the cost of this operation
70
- llama_pos max_pos() const {
71
- llama_pos max_pos = -1;
72
- for (const auto & cell : cells) {
73
- max_pos = std::max(max_pos, cell.pos);
74
- }
75
-
76
- return max_pos;
77
- }
78
- };
79
-
80
- // a structure holds information about the slot found in llama_kv_cache_find_slot
81
- struct llama_kv_cache_slot_info {
82
- std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
83
- bool found = false; // the slot was found
84
-
85
- explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
86
- llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
87
-
88
- operator bool() const { return found; }
89
- };
90
-
91
- // TODO: maybe not needed
92
- uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
93
-
94
- bool llama_kv_cache_init(
95
- struct llama_kv_cache & cache,
96
- const llama_model & model,
97
- const llama_cparams & cparams,
98
- lm_ggml_type type_k,
99
- lm_ggml_type type_v,
100
- uint32_t kv_size,
101
- bool offload);
102
-
103
- // find an empty slot of size "n_tokens" in the cache
104
- // updates the cache head
105
- // returns a structure holding information about the slot found
106
- // Note: On success, it's important that cache.head points
107
- // to the first cell of the slot.
108
- struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
109
- struct llama_kv_cache & cache,
110
- const struct llama_ubatch & batch);
111
-
112
- // find how many cells are currently in use
113
- uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
114
-
115
- void llama_kv_cache_clear(struct llama_kv_cache & cache);
116
-
117
- bool llama_kv_cache_seq_rm(
118
- struct llama_kv_cache & cache,
119
- llama_seq_id seq_id,
120
- llama_pos p0,
121
- llama_pos p1);
122
-
123
- void llama_kv_cache_seq_cp(
124
- struct llama_kv_cache & cache,
125
- llama_seq_id seq_id_src,
126
- llama_seq_id seq_id_dst,
127
- llama_pos p0,
128
- llama_pos p1);
129
-
130
- void llama_kv_cache_seq_keep(
131
- struct llama_kv_cache & cache,
132
- llama_seq_id seq_id);
133
-
134
- void llama_kv_cache_seq_add(
135
- struct llama_kv_cache & cache,
136
- llama_seq_id seq_id,
137
- llama_pos p0,
138
- llama_pos p1,
139
- llama_pos delta);
140
-
141
- void llama_kv_cache_seq_div(
142
- struct llama_kv_cache & cache,
143
- llama_seq_id seq_id,
144
- llama_pos p0,
145
- llama_pos p1,
146
- int d);
147
-
148
- llama_pos llama_kv_cache_seq_pos_max(
149
- struct llama_kv_cache & cache,
150
- llama_seq_id seq_id);
151
-
152
- void llama_kv_cache_defrag(struct llama_kv_cache & cache);
153
-
154
- int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
155
-
156
- int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
157
-
158
- bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
159
-
160
- //
161
- // kv cache view
162
- //
163
-
164
- struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
165
-
166
- void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
167
-
168
- //
169
- // kv cache restore
170
- //
171
-
172
- // saves the kv_cache state for future recovery.
173
- // used to rollback llama_kv_cache_find_slot changes.
174
- struct llama_kv_slot_restorer {
175
- struct llama_kv_cache_state {
176
- uint32_t head = 0;
177
- uint32_t n = 0;
178
- } old_state;
179
-
180
- // for non-recurrent models only
181
- // list of slots to restore
182
- std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
183
-
184
- bool do_restore = false;
185
-
186
- explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
187
- old_state.head = cache.head;
188
- old_state.n = cache.n;
189
- }
190
-
191
- // saves a slot information for future restoration
192
- void save(const struct llama_kv_cache_slot_info & slot) {
193
- if (slot) {
194
- do_restore = true;
195
- if (slot.boundaries.first != slot.boundaries.second) {
196
- slot_boundaries.push_back(slot.boundaries);
197
- }
198
- }
199
- }
200
-
201
- // must be explicitly called to restore the kv_cache state
202
- // and rollback changes from all llama_kv_cache_find_slot calls
203
- void restore(struct llama_kv_cache & cache) {
204
- if (do_restore) {
205
- cache.head = old_state.head;
206
- cache.n = old_state.n;
207
-
208
- if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
209
- llama_kv_cache_seq_rm(cache, -1, -1, -1);
210
- } else {
211
- for (auto & slot : slot_boundaries) {
212
- llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
213
- }
214
- }
215
- }
216
- }
217
- };
218
-
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include "ggml-cpp.h"
6
+
7
+ #include <set>
8
+ #include <vector>
9
+ #include <algorithm>
10
+
11
+ struct llama_kv_cell {
12
+ llama_pos pos = -1;
13
+ llama_pos delta = 0;
14
+ int32_t src = -1; // used by recurrent state models to copy states
15
+ int32_t tail = -1;
16
+
17
+ std::set<llama_seq_id> seq_id;
18
+
19
+ bool has_seq_id(const llama_seq_id & id) const {
20
+ return seq_id.find(id) != seq_id.end();
21
+ }
22
+
23
+ bool is_empty() const {
24
+ return seq_id.empty();
25
+ }
26
+
27
+ bool is_same_seq(const llama_kv_cell & other) const {
28
+ return seq_id == other.seq_id;
29
+ }
30
+ };
31
+
32
+ // ring-buffer of cached KV data
33
+ struct llama_kv_cache {
34
+ bool has_shift = false;
35
+ bool do_defrag = false;
36
+ bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
37
+ bool v_trans = true; // the value tensor is transposed
38
+ bool can_shift = false;
39
+
40
+ // Note: The value of head isn't only used to optimize searching
41
+ // for a free KV slot. llama_decode_impl also uses it, so it
42
+ // cannot be freely changed after a slot has been allocated.
43
+ uint32_t head = 0;
44
+ uint32_t size = 0;
45
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)
46
+
47
+ // computed before each graph build
48
+ uint32_t n = 0;
49
+
50
+ lm_ggml_type type_k = LM_GGML_TYPE_F16;
51
+ lm_ggml_type type_v = LM_GGML_TYPE_F16;
52
+
53
+ std::vector<llama_kv_cell> cells;
54
+
55
+ std::vector<struct lm_ggml_tensor *> k_l; // per layer
56
+ std::vector<struct lm_ggml_tensor *> v_l;
57
+
58
+ std::vector<lm_ggml_context_ptr> ctxs;
59
+ std::vector<lm_ggml_backend_buffer_ptr> bufs;
60
+
61
+ size_t total_size() const {
62
+ size_t size = 0;
63
+ for (const auto & buf : bufs) {
64
+ size += lm_ggml_backend_buffer_get_size(buf.get());
65
+ }
66
+
67
+ return size;
68
+ }
69
+
70
+ // TODO: better data structures to reduce the cost of this operation
71
+ llama_pos max_pos() const {
72
+ llama_pos max_pos = -1;
73
+ for (const auto & cell : cells) {
74
+ max_pos = std::max(max_pos, cell.pos);
75
+ }
76
+
77
+ return max_pos;
78
+ }
79
+ };
80
+
81
+ // a structure holds information about the slot found in llama_kv_cache_find_slot
82
+ struct llama_kv_cache_slot_info {
83
+ std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
84
+ bool found = false; // the slot was found
85
+
86
+ explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
87
+ llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
88
+
89
+ operator bool() const { return found; }
90
+ };
91
+
92
+ // TODO: maybe not needed
93
+ uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
94
+
95
+ bool llama_kv_cache_init(
96
+ struct llama_kv_cache & cache,
97
+ const llama_model & model,
98
+ const llama_cparams & cparams,
99
+ lm_ggml_type type_k,
100
+ lm_ggml_type type_v,
101
+ uint32_t kv_size,
102
+ bool offload);
103
+
104
+ // find an empty slot of size "n_tokens" in the cache
105
+ // updates the cache head
106
+ // returns a structure holding information about the slot found
107
+ // Note: On success, it's important that cache.head points
108
+ // to the first cell of the slot.
109
+ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
110
+ struct llama_kv_cache & cache,
111
+ const struct llama_ubatch & batch);
112
+
113
+ // find how many cells are currently in use
114
+ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
115
+
116
+ void llama_kv_cache_clear(struct llama_kv_cache & cache);
117
+
118
+ bool llama_kv_cache_seq_rm(
119
+ struct llama_kv_cache & cache,
120
+ llama_seq_id seq_id,
121
+ llama_pos p0,
122
+ llama_pos p1);
123
+
124
+ void llama_kv_cache_seq_cp(
125
+ struct llama_kv_cache & cache,
126
+ llama_seq_id seq_id_src,
127
+ llama_seq_id seq_id_dst,
128
+ llama_pos p0,
129
+ llama_pos p1);
130
+
131
+ void llama_kv_cache_seq_keep(
132
+ struct llama_kv_cache & cache,
133
+ llama_seq_id seq_id);
134
+
135
+ void llama_kv_cache_seq_add(
136
+ struct llama_kv_cache & cache,
137
+ llama_seq_id seq_id,
138
+ llama_pos p0,
139
+ llama_pos p1,
140
+ llama_pos delta);
141
+
142
+ void llama_kv_cache_seq_div(
143
+ struct llama_kv_cache & cache,
144
+ llama_seq_id seq_id,
145
+ llama_pos p0,
146
+ llama_pos p1,
147
+ int d);
148
+
149
+ llama_pos llama_kv_cache_seq_pos_max(
150
+ struct llama_kv_cache & cache,
151
+ llama_seq_id seq_id);
152
+
153
+ void llama_kv_cache_defrag(struct llama_kv_cache & cache);
154
+
155
+ int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
156
+
157
+ int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
158
+
159
+ bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
160
+
161
+ //
162
+ // kv cache view
163
+ //
164
+
165
+ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
166
+
167
+ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
168
+
169
+ //
170
+ // kv cache restore
171
+ //
172
+
173
+ // saves the kv_cache state for future recovery.
174
+ // used to rollback llama_kv_cache_find_slot changes.
175
+ struct llama_kv_slot_restorer {
176
+ struct llama_kv_cache_state {
177
+ uint32_t head = 0;
178
+ uint32_t n = 0;
179
+ } old_state;
180
+
181
+ // for non-recurrent models only
182
+ // list of slots to restore
183
+ std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
184
+
185
+ bool do_restore = false;
186
+
187
+ explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
188
+ old_state.head = cache.head;
189
+ old_state.n = cache.n;
190
+ }
191
+
192
+ // saves a slot information for future restoration
193
+ void save(const struct llama_kv_cache_slot_info & slot) {
194
+ if (slot) {
195
+ do_restore = true;
196
+ if (slot.boundaries.first != slot.boundaries.second) {
197
+ slot_boundaries.push_back(slot.boundaries);
198
+ }
199
+ }
200
+ }
201
+
202
+ // must be explicitly called to restore the kv_cache state
203
+ // and rollback changes from all llama_kv_cache_find_slot calls
204
+ void restore(struct llama_kv_cache & cache) {
205
+ if (do_restore) {
206
+ cache.head = old_state.head;
207
+ cache.n = old_state.n;
208
+
209
+ if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
210
+ llama_kv_cache_seq_rm(cache, -1, -1, -1);
211
+ } else {
212
+ for (auto & slot : slot_boundaries) {
213
+ llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
214
+ }
215
+ }
216
+ }
217
+ }
218
+ };
219
+