cui-llama.rn 1.4.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (366)
  1. package/LICENSE +20 -20
  2. package/README.md +317 -319
  3. package/android/build.gradle +116 -116
  4. package/android/gradle.properties +5 -5
  5. package/android/src/main/AndroidManifest.xml +4 -4
  6. package/android/src/main/CMakeLists.txt +124 -117
  7. package/android/src/main/java/com/rnllama/LlamaContext.java +645 -645
  8. package/android/src/main/java/com/rnllama/RNLlama.java +695 -695
  9. package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -48
  10. package/android/src/main/jni-utils.h +100 -100
  11. package/android/src/main/jni.cpp +1263 -1245
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  14. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  15. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  16. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  17. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  20. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +135 -135
  21. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +136 -136
  22. package/cpp/README.md +4 -4
  23. package/cpp/binary-ops.cpp +158 -0
  24. package/cpp/binary-ops.h +16 -0
  25. package/cpp/chat.cpp +1769 -1779
  26. package/cpp/chat.h +9 -1
  27. package/cpp/common.cpp +20 -522
  28. package/cpp/common.h +13 -36
  29. package/cpp/cpu-common.h +72 -0
  30. package/cpp/ggml-common.h +12 -6
  31. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  32. package/cpp/ggml-cpu-impl.h +2 -21
  33. package/cpp/ggml-cpu-quants.c +904 -405
  34. package/cpp/ggml-cpu.c +909 -13237
  35. package/cpp/ggml-impl.h +50 -23
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal-impl.h +597 -523
  39. package/cpp/ggml-metal.m +798 -580
  40. package/cpp/ggml.c +92 -3
  41. package/cpp/ggml.h +30 -6
  42. package/cpp/gguf.cpp +1 -0
  43. package/cpp/llama-adapter.cpp +55 -20
  44. package/cpp/llama-adapter.h +11 -9
  45. package/cpp/llama-arch.cpp +217 -16
  46. package/cpp/llama-arch.h +25 -0
  47. package/cpp/llama-batch.h +2 -2
  48. package/cpp/llama-chat.cpp +54 -2
  49. package/cpp/llama-chat.h +3 -0
  50. package/cpp/llama-context.cpp +2294 -1238
  51. package/cpp/llama-context.h +214 -77
  52. package/cpp/llama-cparams.h +1 -0
  53. package/cpp/llama-graph.cpp +1695 -0
  54. package/cpp/llama-graph.h +592 -0
  55. package/cpp/llama-hparams.cpp +8 -0
  56. package/cpp/llama-hparams.h +17 -0
  57. package/cpp/llama-io.cpp +15 -0
  58. package/cpp/llama-io.h +35 -0
  59. package/cpp/llama-kv-cache.cpp +965 -303
  60. package/cpp/llama-kv-cache.h +145 -151
  61. package/cpp/llama-memory.cpp +1 -0
  62. package/cpp/llama-memory.h +21 -0
  63. package/cpp/llama-mmap.cpp +1 -1
  64. package/cpp/llama-model-loader.cpp +10 -5
  65. package/cpp/llama-model-loader.h +5 -3
  66. package/cpp/llama-model.cpp +9194 -201
  67. package/cpp/llama-model.h +40 -1
  68. package/cpp/llama-sampling.cpp +5 -0
  69. package/cpp/llama-vocab.cpp +36 -5
  70. package/cpp/llama.cpp +51 -9984
  71. package/cpp/llama.h +102 -22
  72. package/cpp/log.cpp +34 -0
  73. package/cpp/minja/chat-template.hpp +15 -7
  74. package/cpp/minja/minja.hpp +120 -94
  75. package/cpp/ops.cpp +8723 -0
  76. package/cpp/ops.h +128 -0
  77. package/cpp/rn-llama.cpp +873 -882
  78. package/cpp/rn-llama.h +138 -148
  79. package/cpp/sampling.cpp +3 -0
  80. package/cpp/sampling.h +107 -107
  81. package/cpp/sgemm.cpp +533 -88
  82. package/cpp/simd-mappings.h +888 -0
  83. package/cpp/speculative.cpp +4 -4
  84. package/cpp/unary-ops.cpp +186 -0
  85. package/cpp/unary-ops.h +28 -0
  86. package/cpp/unicode-data.cpp +7034 -7034
  87. package/cpp/unicode-data.h +20 -20
  88. package/cpp/unicode.cpp +849 -849
  89. package/cpp/unicode.h +66 -66
  90. package/cpp/vec.cpp +258 -0
  91. package/cpp/vec.h +802 -0
  92. package/ios/CMakeLists.txt +116 -105
  93. package/ios/RNLlama.h +7 -7
  94. package/ios/RNLlama.mm +418 -405
  95. package/ios/RNLlamaContext.h +57 -57
  96. package/ios/RNLlamaContext.mm +835 -819
  97. package/ios/rnllama.xcframework/Info.plist +74 -74
  98. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  99. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +143 -0
  100. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +677 -0
  101. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  102. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/gguf.h +202 -0
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +1434 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/log.h +132 -0
  143. package/{cpp → ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja}/chat-template.hpp +15 -7
  144. package/{cpp → ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja}/minja.hpp +120 -94
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +128 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sampling.h +107 -0
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/speculative.h +28 -0
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode.h +66 -0
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +802 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  184. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  191. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  192. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  193. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  194. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  195. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  196. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  197. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  198. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  199. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  200. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  201. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  202. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  203. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  204. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  205. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  206. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  207. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  208. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  209. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  210. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  211. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  212. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  213. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  214. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  215. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  216. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  217. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  218. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +143 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +677 -0
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  225. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  226. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  227. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  228. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  229. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  230. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  231. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  232. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  233. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  234. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  235. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  236. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  237. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  238. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  239. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  240. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/gguf.h +202 -0
  241. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  242. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  243. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  244. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  245. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  246. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  247. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  248. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  249. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  250. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  251. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  261. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  262. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +1434 -0
  263. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/log.h +132 -0
  264. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  265. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  266. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +128 -0
  267. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  268. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sampling.h +107 -0
  269. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  270. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/speculative.h +28 -0
  272. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  273. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  274. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode.h +66 -0
  275. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +802 -0
  276. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  277. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  278. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  279. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  280. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  281. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  282. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  283. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  284. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  285. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  286. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  287. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  288. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  289. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  290. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  291. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  292. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  293. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  294. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  295. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  296. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  297. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  298. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  299. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  300. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  301. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  302. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  303. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  304. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  305. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  306. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  307. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  308. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  309. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  310. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  311. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  312. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  313. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  314. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  315. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  316. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  317. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  318. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  319. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  320. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  321. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  322. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  323. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  324. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  325. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  326. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  327. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  328. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  329. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  330. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  331. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  332. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  333. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  334. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  335. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  336. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  337. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  338. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  339. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  340. package/jest/mock.js +203 -203
  341. package/lib/commonjs/NativeRNLlama.js +1 -2
  342. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  343. package/lib/commonjs/chat.js.map +1 -1
  344. package/lib/commonjs/grammar.js +12 -31
  345. package/lib/commonjs/grammar.js.map +1 -1
  346. package/lib/commonjs/index.js +47 -47
  347. package/lib/commonjs/index.js.map +1 -1
  348. package/lib/commonjs/package.json +1 -0
  349. package/lib/module/NativeRNLlama.js +2 -0
  350. package/lib/module/NativeRNLlama.js.map +1 -1
  351. package/lib/module/chat.js +2 -0
  352. package/lib/module/chat.js.map +1 -1
  353. package/lib/module/grammar.js +14 -31
  354. package/lib/module/grammar.js.map +1 -1
  355. package/lib/module/index.js +47 -45
  356. package/lib/module/index.js.map +1 -1
  357. package/lib/module/package.json +1 -0
  358. package/lib/typescript/NativeRNLlama.d.ts +6 -4
  359. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  360. package/lib/typescript/index.d.ts.map +1 -1
  361. package/llama-rn.podspec +48 -48
  362. package/package.json +233 -233
  363. package/src/NativeRNLlama.ts +426 -424
  364. package/src/chat.ts +44 -44
  365. package/src/grammar.ts +854 -854
  366. package/src/index.ts +495 -485
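The headline change in this release is the rewrite of the KV-cache layer (items 59-60 above). As the llama-kv-cache.cpp excerpt below shows, the free functions llama_kv_cache_init, llama_kv_cache_find_slot, llama_kv_cache_clear, and friends become methods of a new llama_kv_cache_unified class, find_slot returns a plain bool instead of a llama_kv_cache_slot_info, and every successful find_slot now records the claimed cell range in pending.ranges so the caller can either commit() the cells or restore() (roll back) after a failed decode. A minimal, self-contained sketch of that commit/restore bookkeeping follows; the types and names are a toy illustration, not the library's API:

// Toy sketch (assumed names, not the cui-llama.rn API) of the pending-ranges
// mechanism that find_slot()/commit()/restore() implement in the diff below.
#include <cstdint>
#include <cstdio>
#include <vector>

struct toy_kv_cache {
    struct range { uint32_t c0, c1; };   // cells [c0, c1) claimed by one find_slot()

    std::vector<int32_t> pos;            // per-cell position, -1 == empty
    std::vector<range>   pending;        // ranges claimed since the last commit()
    uint32_t head = 0;
    uint32_t used = 0;

    explicit toy_kv_cache(uint32_t size) : pos(size, -1) {}

    // claim n_tokens contiguous cells at head (no wrap-around in this toy)
    bool find_slot(uint32_t n_tokens, int32_t first_pos) {
        if (head + n_tokens > pos.size()) {
            return false;
        }
        for (uint32_t i = 0; i < n_tokens; ++i) {
            pos[head + i] = first_pos + (int32_t) i;
        }
        used += n_tokens;
        pending.push_back({head, head + n_tokens});   // remember what to undo
        head += n_tokens;
        return true;
    }

    // decode succeeded: keep the cells, forget the pending ranges
    void commit() {
        pending.clear();
    }

    // decode failed: clear exactly the cells claimed since the last commit()
    void restore() {
        for (const auto & r : pending) {
            for (uint32_t i = r.c0; i < r.c1; ++i) {
                pos[i] = -1;
                used--;
            }
        }
        pending.clear();
    }
};

int main() {
    toy_kv_cache kv(16);
    if (kv.find_slot(4, 0)) kv.commit();    // batch 1 decoded fine: keep it
    if (kv.find_slot(4, 4)) kv.restore();   // batch 2 failed: roll it back
    std::printf("used = %u\n", kv.used);    // prints "used = 4"
}

Tracking only the ranges touched since the last commit() keeps a rollback proportional to the tokens in flight rather than to the whole cache.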
@@ -6,86 +6,90 @@
6
6
  #include "llama-model.h"
7
7
 
8
8
  #include <algorithm>
9
+ #include <cassert>
9
10
  #include <limits>
10
11
  #include <map>
12
+ #include <stdexcept>
11
13
 
12
- static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
13
-
14
- uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
15
- // the FA kernels require padding to avoid extra runtime boundary checks
16
- return cparams.flash_attn ? 256u : 32u;
14
+ llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
17
15
  }
18
16
 
19
- bool llama_kv_cache_init(
20
- struct llama_kv_cache & cache,
21
- const llama_model & model,
22
- const llama_cparams & cparams,
23
- lm_ggml_type type_k,
24
- lm_ggml_type type_v,
25
- uint32_t kv_size,
26
- bool offload) {
27
- const struct llama_hparams & hparams = model.hparams;
28
-
17
+ bool llama_kv_cache_unified::init(
18
+ const llama_model & model,
19
+ const llama_cparams & cparams,
20
+ lm_ggml_type type_k,
21
+ lm_ggml_type type_v,
22
+ uint32_t kv_size,
23
+ bool offload) {
29
24
  const int32_t n_layer = hparams.n_layer;
30
25
 
31
- cache.has_shift = false;
26
+ has_shift = false;
32
27
 
33
- cache.recurrent = llama_model_is_recurrent(&model);
34
- cache.v_trans = !cache.recurrent && !cparams.flash_attn;
35
- cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
28
+ recurrent = llama_model_is_recurrent(&model);
29
+ v_trans = !recurrent && !cparams.flash_attn;
30
+ can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
36
31
 
37
32
  LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
38
- __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer, cache.can_shift);
33
+ __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer, can_shift);
39
34
 
40
- cache.head = 0;
41
- cache.size = kv_size;
42
- cache.used = 0;
35
+ head = 0;
36
+ size = kv_size;
37
+ used = 0;
43
38
 
44
- cache.type_k = type_k;
45
- cache.type_v = type_v;
39
+ this->type_k = type_k;
40
+ this->type_v = type_v;
46
41
 
47
- cache.cells.clear();
48
- cache.cells.resize(kv_size);
42
+ cells.clear();
43
+ cells.resize(kv_size);
49
44
 
50
45
  // create a context for each buffer type
51
46
  std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
52
47
  auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
53
48
  auto it = ctx_map.find(buft);
54
49
  if (it == ctx_map.end()) {
55
- struct lm_ggml_init_params params = {
50
+ lm_ggml_init_params params = {
56
51
  /*.mem_size =*/ size_t(2u*n_layer*lm_ggml_tensor_overhead()),
57
52
  /*.mem_buffer =*/ NULL,
58
53
  /*.no_alloc =*/ true,
59
54
  };
55
+
60
56
  lm_ggml_context * ctx = lm_ggml_init(params);
61
57
  if (!ctx) {
62
58
  return nullptr;
63
59
  }
60
+
64
61
  ctx_map[buft] = ctx;
65
- cache.ctxs.emplace_back(ctx);
62
+ ctxs.emplace_back(ctx);
63
+
66
64
  return ctx;
67
65
  }
66
+
68
67
  return it->second;
69
68
  };
70
69
 
71
- cache.k_l.reserve(n_layer);
72
- cache.v_l.reserve(n_layer);
70
+ k_l.reserve(n_layer);
71
+ v_l.reserve(n_layer);
73
72
 
74
73
  for (int i = 0; i < n_layer; i++) {
75
74
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
76
75
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
77
76
 
78
- LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
77
+ const char * dev_name = "CPU";
79
78
 
80
79
  lm_ggml_backend_buffer_type_t buft;
81
80
  if (offload) {
82
81
  auto * dev = model.dev_layer(i);
83
82
  buft = lm_ggml_backend_dev_buffer_type(dev);
83
+
84
+ dev_name = lm_ggml_backend_dev_name(dev);
84
85
  } else {
85
86
  buft = lm_ggml_backend_cpu_buffer_type();
86
87
  }
87
- lm_ggml_context * ctx = ctx_for_buft(buft);
88
88
 
89
+ LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
90
+ i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
91
+
92
+ lm_ggml_context * ctx = ctx_for_buft(buft);
89
93
  if (!ctx) {
90
94
  LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
91
95
  return false;
@@ -95,8 +99,8 @@ bool llama_kv_cache_init(
95
99
  lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
96
100
  lm_ggml_format_name(k, "cache_k_l%d", i);
97
101
  lm_ggml_format_name(v, "cache_v_l%d", i);
98
- cache.k_l.push_back(k);
99
- cache.v_l.push_back(v);
102
+ k_l.push_back(k);
103
+ v_l.push_back(v);
100
104
  }
101
105
 
102
106
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
@@ -111,20 +115,403 @@ bool llama_kv_cache_init(
111
115
  }
112
116
  lm_ggml_backend_buffer_clear(buf, 0);
113
117
  LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
114
- cache.bufs.emplace_back(buf);
118
+ bufs.emplace_back(buf);
119
+ }
120
+
121
+ return true;
122
+ }
123
+
124
+ int32_t llama_kv_cache_unified::get_n_tokens() const {
125
+ int32_t result = 0;
126
+
127
+ for (uint32_t i = 0; i < size; i++) {
128
+ result += cells[i].seq_id.size();
129
+ }
130
+
131
+ return result;
132
+ }
133
+
134
+ int32_t llama_kv_cache_unified::get_used_cells() const {
135
+ return used;
136
+ }
137
+
138
+ size_t llama_kv_cache_unified::total_size() const {
139
+ size_t size = 0;
140
+ for (const auto & buf : bufs) {
141
+ size += lm_ggml_backend_buffer_get_size(buf.get());
142
+ }
143
+
144
+ return size;
145
+ }
146
+
147
+ llama_pos llama_kv_cache_unified::pos_max() const {
148
+ llama_pos pos_max = -1;
149
+ for (const auto & cell : cells) {
150
+ pos_max = std::max(pos_max, cell.pos);
151
+ }
152
+
153
+ return pos_max;
154
+ }
155
+
156
+ void llama_kv_cache_unified::clear() {
157
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
158
+ cells[i].pos = -1;
159
+ cells[i].seq_id.clear();
160
+ cells[i].src = -1;
161
+ cells[i].tail = -1;
162
+ }
163
+ head = 0;
164
+ used = 0;
165
+
166
+ for (auto & buf : bufs) {
167
+ lm_ggml_backend_buffer_clear(buf.get(), 0);
168
+ }
169
+ }
170
+
171
+ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
172
+ uint32_t new_head = size;
173
+
174
+ if (p0 < 0) {
175
+ p0 = 0;
176
+ }
177
+
178
+ if (p1 < 0) {
179
+ p1 = std::numeric_limits<llama_pos>::max();
180
+ }
181
+
182
+ // models like Mamba or RWKV can't have a state partially erased
183
+ if (recurrent) {
184
+ if (seq_id >= (int64_t) size) {
185
+ // could be fatal
186
+ return false;
187
+ }
188
+ if (0 <= seq_id) {
189
+ int32_t & tail_id = cells[seq_id].tail;
190
+ if (tail_id >= 0) {
191
+ const llama_kv_cell & cell = cells[tail_id];
192
+ // partial intersection is invalid
193
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
194
+ return false;
195
+ }
196
+ // invalidate tails which will be cleared
197
+ if (p0 <= cell.pos && cell.pos < p1) {
198
+ tail_id = -1;
199
+ }
200
+ }
201
+ } else {
202
+ // seq_id is negative, then the range should include everything or nothing
203
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
204
+ return false;
205
+ }
206
+ }
207
+
208
+ return true;
209
+ }
210
+
211
+ for (uint32_t i = 0; i < size; ++i) {
212
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
213
+ if (seq_id < 0) {
214
+ cells[i].seq_id.clear();
215
+ } else if (cells[i].has_seq_id(seq_id)) {
216
+ cells[i].seq_id.erase(seq_id);
217
+ } else {
218
+ continue;
219
+ }
220
+ if (cells[i].is_empty()) {
221
+ // keep count of the number of used cells
222
+ if (cells[i].pos >= 0) {
223
+ used--;
224
+ }
225
+
226
+ cells[i].pos = -1;
227
+ cells[i].src = -1;
228
+
229
+ if (new_head == size) {
230
+ new_head = i;
231
+ }
232
+ }
233
+ }
234
+ }
235
+
236
+ // If we freed up a slot, set head to it so searching can start there.
237
+ if (new_head != size && new_head < head) {
238
+ head = new_head;
115
239
  }
116
240
 
117
241
  return true;
118
242
  }
119
243
 
120
- struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
121
- struct llama_kv_cache & cache,
122
- const struct llama_ubatch & ubatch) {
244
+ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
245
+ if (seq_id_src == seq_id_dst) {
246
+ return;
247
+ }
248
+
249
+ if (p0 < 0) {
250
+ p0 = 0;
251
+ }
252
+
253
+ if (p1 < 0) {
254
+ p1 = std::numeric_limits<llama_pos>::max();
255
+ }
256
+
257
+ if (recurrent) {
258
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
259
+ llama_kv_cell & tail_src = cells[seq_id_src];
260
+ llama_kv_cell & tail_dst = cells[seq_id_dst];
261
+ if (tail_dst.tail >= 0) {
262
+ // clear destination seq_id if it wasn't empty
263
+ llama_kv_cell & cell_dst = cells[tail_dst.tail];
264
+
265
+ cell_dst.seq_id.erase(seq_id_dst);
266
+ tail_dst.tail = -1;
267
+ if (cell_dst.seq_id.empty()) {
268
+ cell_dst.pos = -1;
269
+ cell_dst.delta = -1;
270
+ cell_dst.src = -1;
271
+ used -= 1;
272
+ }
273
+ }
274
+ if (tail_src.tail >= 0) {
275
+ llama_kv_cell & cell_src = cells[tail_src.tail];
276
+
277
+ cell_src.seq_id.insert(seq_id_dst);
278
+ tail_dst.tail = tail_src.tail;
279
+ }
280
+ }
281
+
282
+ return;
283
+ }
284
+
285
+ // otherwise, this is the KV of a Transformer-like model
286
+ head = 0;
287
+
288
+ for (uint32_t i = 0; i < size; ++i) {
289
+ if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
290
+ cells[i].seq_id.insert(seq_id_dst);
291
+ }
292
+ }
293
+ }
294
+
295
+ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
296
+ uint32_t new_head = size;
297
+
298
+ for (uint32_t i = 0; i < size; ++i) {
299
+ if (recurrent && (llama_seq_id) i != seq_id) {
300
+ cells[i].tail = -1;
301
+ }
302
+
303
+ if (!cells[i].has_seq_id(seq_id)) {
304
+ if (cells[i].pos >= 0) {
305
+ used--;
306
+ }
307
+
308
+ cells[i].pos = -1;
309
+ cells[i].src = -1;
310
+ cells[i].seq_id.clear();
311
+
312
+ if (new_head == size){
313
+ new_head = i;
314
+ }
315
+ } else {
316
+ cells[i].seq_id.clear();
317
+ cells[i].seq_id.insert(seq_id);
318
+ }
319
+ }
320
+
321
+ // If we freed up a slot, set head to it so searching can start there.
322
+ if (new_head != size && new_head < head) {
323
+ head = new_head;
324
+ }
325
+ }
326
+
327
+ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
328
+ if (delta == 0) {
329
+ return;
330
+ }
331
+
332
+ uint32_t new_head = size;
333
+
334
+ if (p0 < 0) {
335
+ p0 = 0;
336
+ }
337
+
338
+ if (p1 < 0) {
339
+ p1 = std::numeric_limits<llama_pos>::max();
340
+ }
341
+
342
+ // If there is no range then return early to avoid looping over the
343
+ if (p0 == p1) {
344
+ return;
345
+ }
346
+
347
+ if (recurrent) {
348
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
349
+ if (0 <= seq_id && seq_id < (int64_t) size) {
350
+ const int32_t tail_id = cells[seq_id].tail;
351
+ if (tail_id >= 0) {
352
+ llama_kv_cell & cell = cells[tail_id];
353
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
354
+ cell.pos += delta;
355
+ }
356
+ }
357
+ }
358
+ return;
359
+ }
360
+
361
+ for (uint32_t i = 0; i < size; ++i) {
362
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
363
+ has_shift = true;
364
+ cells[i].pos += delta;
365
+ cells[i].delta += delta;
366
+
367
+ if (cells[i].pos < 0) {
368
+ if (!cells[i].is_empty()) {
369
+ used--;
370
+ }
371
+ cells[i].pos = -1;
372
+ cells[i].seq_id.clear();
373
+ if (new_head == size) {
374
+ new_head = i;
375
+ }
376
+ }
377
+ }
378
+ }
379
+
380
+ // If we freed up a slot, set head to it so searching can start there.
381
+ // Otherwise we just start the next search from the beginning.
382
+ head = new_head != size ? new_head : 0;
383
+ }
384
+
385
+ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
386
+ if (d == 1) {
387
+ return;
388
+ }
389
+
390
+ if (p0 < 0) {
391
+ p0 = 0;
392
+ }
393
+
394
+ if (p1 < 0) {
395
+ p1 = std::numeric_limits<llama_pos>::max();
396
+ }
397
+
398
+ // If there is no range then return early to avoid looping over the cache.
399
+ if (p0 == p1) {
400
+ return;
401
+ }
402
+
403
+ if (recurrent) {
404
+ // for Mamba-like or RWKV models, only the pos needs to be changed
405
+ if (0 <= seq_id && seq_id < (int64_t) size) {
406
+ const int32_t tail_id = cells[seq_id].tail;
407
+ if (tail_id >= 0) {
408
+ llama_kv_cell & cell = cells[tail_id];
409
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
410
+ cell.pos /= d;
411
+ }
412
+ }
413
+ }
414
+
415
+ return;
416
+ }
417
+
418
+ for (uint32_t i = 0; i < size; ++i) {
419
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
420
+ has_shift = true;
421
+
422
+ {
423
+ llama_pos p_old = cells[i].pos;
424
+ cells[i].pos /= d;
425
+ cells[i].delta += cells[i].pos - p_old;
426
+ }
427
+ }
428
+ }
429
+ }
430
+
431
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
432
+ llama_pos result = 0;
433
+
434
+ for (uint32_t i = 0; i < size; ++i) {
435
+ if (cells[i].has_seq_id(seq_id)) {
436
+ result = std::max(result, cells[i].pos);
437
+ }
438
+ }
439
+
440
+ return result;
441
+ }
442
+
443
+ void llama_kv_cache_unified::defrag() {
444
+ if (!recurrent) {
445
+ do_defrag = true;
446
+ }
447
+ }
448
+
449
+ void llama_kv_cache_unified::restore() {
450
+ if (pending.ranges.empty()) {
451
+ return;
452
+ }
453
+
454
+ // TODO: tmp - move to llama_kv_cache_recurrent
455
+ if (recurrent) {
456
+ seq_rm(-1, -1, -1);
457
+ return;
458
+ }
459
+
460
+ uint32_t new_head = size;
461
+
462
+ for (auto & range : pending.ranges) {
463
+ for (uint32_t i = range.c0; i < range.c1; ++i) {
464
+ cells[i].seq_id.clear();
465
+
466
+ // keep count of the number of used cells
467
+ if (cells[i].pos >= 0) {
468
+ used--;
469
+ }
470
+
471
+ cells[i].pos = -1;
472
+ cells[i].src = -1;
473
+ }
474
+
475
+ new_head = std::min(new_head, range.c0);
476
+ }
477
+
478
+ if (new_head != size && new_head < head) {
479
+ head = new_head;
480
+ }
481
+ }
482
+
483
+ void llama_kv_cache_unified::commit() {
484
+ // TODO: tmp - move to llama_kv_cache_recurrent
485
+ if (recurrent) {
486
+ return;
487
+ }
488
+
489
+ if (pending.ranges.empty()) {
490
+ LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
491
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
492
+ return;
493
+ }
494
+
495
+ pending.ranges.clear();
496
+ }
497
+
498
+ bool llama_kv_cache_unified::get_can_shift() const {
499
+ return can_shift;
500
+ }
501
+
502
+ bool llama_kv_cache_unified::find_slot(
503
+ const llama_ubatch & ubatch) {
123
504
  const uint32_t n_tokens = ubatch.n_tokens;
124
505
  const uint32_t n_seqs = ubatch.n_seqs;
125
506
  const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
126
507
 
127
- if (cache.recurrent) {
508
+ // if we have enough unused cells before the current head ->
509
+ // better to start searching from the beginning of the cache, hoping to fill it
510
+ if (head > used + 2*ubatch.n_tokens) {
511
+ head = 0;
512
+ }
513
+
514
+ if (recurrent) {
128
515
  // For recurrent state architectures (like Mamba or RWKV),
129
516
  // each cache cell can store the state for a whole sequence.
130
517
  // A slot should be always be contiguous.
@@ -132,7 +519,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
132
519
  // can only process batches with an equal number of new tokens in each sequence
133
520
  LM_GGML_ASSERT(ubatch.equal_seqs);
134
521
 
135
- int32_t min = cache.size - 1;
522
+ int32_t min = size - 1;
136
523
  int32_t max = 0;
137
524
 
138
525
  // everything should fit if all seq_ids are smaller than the max
@@ -141,16 +528,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
141
528
  for (uint32_t j = 0; j < n_seq_id; ++j) {
142
529
  const llama_seq_id seq_id = ubatch.seq_id[s][j];
143
530
 
144
- if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
531
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
145
532
  // too big seq_id
146
533
  // TODO: would it be possible to resize the cache instead?
147
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
148
- return llama_kv_cache_slot_info_failed;
534
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
535
+ return false;
149
536
  }
150
537
  if (j > 0) {
151
- llama_kv_cell & seq = cache.cells[seq_id];
538
+ llama_kv_cell & seq = cells[seq_id];
152
539
  if (seq.tail >= 0) {
153
- llama_kv_cell & cell = cache.cells[seq.tail];
540
+ llama_kv_cell & cell = cells[seq.tail];
154
541
  // clear cells from seq_ids that become shared
155
542
  // (should not normally happen, but let's handle it anyway)
156
543
  cell.seq_id.erase(seq_id);
@@ -158,7 +545,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
158
545
  if (cell.seq_id.empty()) {
159
546
  cell.pos = -1;
160
547
  cell.src = -1;
161
- cache.used -= 1;
548
+ used -= 1;
162
549
  }
163
550
  }
164
551
  }
@@ -168,9 +555,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
168
555
  #ifndef NDEBUG
169
556
  {
170
557
  std::vector<int32_t> tails_verif;
171
- tails_verif.assign(cache.size, -1);
172
- for (uint32_t i = 0; i < cache.size; ++i) {
173
- llama_kv_cell & cell = cache.cells[i];
558
+ tails_verif.assign(size, -1);
559
+ for (uint32_t i = 0; i < size; ++i) {
560
+ llama_kv_cell & cell = cells[i];
174
561
  for (llama_seq_id seq_id : cell.seq_id) {
175
562
  if (tails_verif[seq_id] != -1) {
176
563
  LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
@@ -178,20 +565,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
178
565
  tails_verif[seq_id] = i;
179
566
  }
180
567
  }
181
- for (uint32_t i = 0; i < cache.size; ++i) {
182
- if (tails_verif[i] != cache.cells[i].tail) {
183
- LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
568
+ for (uint32_t i = 0; i < size; ++i) {
569
+ if (tails_verif[i] != cells[i].tail) {
570
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
184
571
  }
185
572
  }
186
573
  }
187
574
  #endif
188
575
 
189
576
  // find next empty cell
190
- uint32_t next_empty_cell = cache.head;
577
+ uint32_t next_empty_cell = head;
191
578
 
192
- for (uint32_t i = 0; i < cache.size; ++i) {
193
- if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
194
- llama_kv_cell & cell = cache.cells[next_empty_cell];
579
+ for (uint32_t i = 0; i < size; ++i) {
580
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
581
+ llama_kv_cell & cell = cells[next_empty_cell];
195
582
  if (cell.is_empty()) { break; }
196
583
  next_empty_cell += 1;
197
584
  }
@@ -199,20 +586,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
199
586
  // find usable cell range
200
587
  for (uint32_t s = 0; s < n_seqs; ++s) {
201
588
  const llama_seq_id seq_id = ubatch.seq_id[s][0];
202
- llama_kv_cell & seq_meta = cache.cells[seq_id];
589
+ llama_kv_cell & seq_meta = cells[seq_id];
203
590
  bool has_cell = false;
204
591
  if (seq_meta.tail >= 0) {
205
- llama_kv_cell & cell = cache.cells[seq_meta.tail];
592
+ llama_kv_cell & cell = cells[seq_meta.tail];
206
593
  LM_GGML_ASSERT(cell.has_seq_id(seq_id));
207
594
  // does this seq_id "own" the cell?
208
595
  if (cell.seq_id.size() == 1) { has_cell = true; }
209
596
  }
210
597
  if (!has_cell) {
211
- llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
598
+ llama_kv_cell & empty_cell = cells[next_empty_cell];
212
599
  LM_GGML_ASSERT(empty_cell.is_empty());
213
600
  // copy old tail into the empty cell
214
601
  if (seq_meta.tail >= 0) {
215
- llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
602
+ llama_kv_cell & orig_cell = cells[seq_meta.tail];
216
603
  empty_cell.pos = orig_cell.pos;
217
604
  empty_cell.src = orig_cell.src;
218
605
  orig_cell.seq_id.erase(seq_id);
@@ -222,9 +609,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
222
609
  // find next empty cell
223
610
  if (s + 1 < n_seqs) {
224
611
  next_empty_cell += 1;
225
- for (uint32_t i = 0; i < cache.size; ++i) {
226
- if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
227
- llama_kv_cell & cell = cache.cells[next_empty_cell];
612
+ for (uint32_t i = 0; i < size; ++i) {
613
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
614
+ llama_kv_cell & cell = cells[next_empty_cell];
228
615
  if (cell.is_empty()) { break; }
229
616
  next_empty_cell += 1;
230
617
  }
@@ -237,10 +624,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
237
624
  // gather and re-order
238
625
  for (uint32_t s = 0; s < n_seqs; ++s) {
239
626
  int32_t dst_id = s + min;
240
- int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
627
+ int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
241
628
  if (dst_id != src_id) {
242
- llama_kv_cell & dst_cell = cache.cells[dst_id];
243
- llama_kv_cell & src_cell = cache.cells[src_id];
629
+ llama_kv_cell & dst_cell = cells[dst_id];
630
+ llama_kv_cell & src_cell = cells[src_id];
244
631
 
245
632
  std::swap(dst_cell.pos, src_cell.pos);
246
633
  std::swap(dst_cell.src, src_cell.src);
@@ -248,10 +635,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
248
635
 
249
636
  // swap tails (assuming they NEVER overlap)
250
637
  for (const llama_seq_id seq_id : src_cell.seq_id) {
251
- cache.cells[seq_id].tail = src_id;
638
+ cells[seq_id].tail = src_id;
252
639
  }
253
640
  for (const llama_seq_id seq_id : dst_cell.seq_id) {
254
- cache.cells[seq_id].tail = dst_id;
641
+ cells[seq_id].tail = dst_id;
255
642
  }
256
643
  }
257
644
  }
@@ -260,7 +647,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
260
647
  for (uint32_t s = 0; s < n_seqs; ++s) {
261
648
  const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
262
649
  int32_t cell_id = s + min;
263
- llama_kv_cell & cell = cache.cells[cell_id];
650
+ llama_kv_cell & cell = cells[cell_id];
264
651
 
265
652
  if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
266
653
  // What should happen when the pos backtracks or skips a value?
@@ -273,41 +660,42 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
273
660
  for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
274
661
  const llama_seq_id seq_id = ubatch.seq_id[s][j];
275
662
  cell.seq_id.insert(seq_id);
276
- cache.cells[seq_id].tail = cell_id;
663
+ cells[seq_id].tail = cell_id;
277
664
  }
278
665
  }
279
666
 
280
667
  // allow getting the range of used cells, from head to head + n
281
- cache.head = min;
282
- cache.n = max - min + 1;
283
- cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
668
+ head = min;
669
+ n = max - min + 1;
670
+ used = std::count_if(cells.begin(), cells.end(),
284
671
  [](const llama_kv_cell& cell){ return !cell.is_empty(); });
285
672
 
286
673
  // sanity check
287
- return llama_kv_cache_slot_info(cache.n >= n_seqs);
674
+ return n >= n_seqs;
288
675
  }
676
+
289
677
  // otherwise, one cell per token.
290
678
 
291
- if (n_tokens > cache.size) {
292
- LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
293
- return llama_kv_cache_slot_info_failed;
679
+ if (n_tokens > size) {
680
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
681
+ return false;
294
682
  }
295
683
 
296
684
  uint32_t n_tested = 0;
297
685
 
298
686
  while (true) {
299
- if (cache.head + n_tokens > cache.size) {
300
- n_tested += cache.size - cache.head;
301
- cache.head = 0;
687
+ if (head + n_tokens > size) {
688
+ n_tested += size - head;
689
+ head = 0;
302
690
  continue;
303
691
  }
304
692
 
305
693
  bool found = true;
306
694
  for (uint32_t i = 0; i < n_tokens; i++) {
307
- if (cache.cells[cache.head + i].pos >= 0) {
695
+ if (cells[head + i].pos >= 0) {
308
696
  found = false;
309
- cache.head += i + 1;
310
- n_tested += i + 1;
697
+ head += i + 1;
698
+ n_tested += i + 1;
311
699
  break;
312
700
  }
313
701
  }
@@ -316,31 +704,38 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= cache.size) {
+        if (n_tested >= size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return llama_kv_cache_slot_info_failed;
+            return false;
         }
     }
 
     for (uint32_t s = 0; s < n_seqs; s++) {
         for (uint32_t i = 0; i < n_seq_tokens; ++i) {
             uint32_t k = s*n_seq_tokens + i;
-            cache.cells[cache.head + k].pos = ubatch.pos[k];
+            cells[head + k].pos = ubatch.pos[k];
 
             for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
-                cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
+                cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
             }
         }
     }
 
-    cache.used += n_tokens;
+    used += n_tokens;
+
+    pending.ranges.push_back({head, head + n_tokens});
+
+    return true;
+}
 
-    return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
+uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
 }
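The refactored find_slot now reports success as a plain bool and records each allocation in pending.ranges, so an uncommitted batch can later be rolled back. The search itself is a linear scan over a ring buffer: starting at head, it looks for n_tokens consecutive free cells (pos < 0), wrapping at size, and gives up once every cell has been tested. A minimal standalone sketch of that strategy (the simplified Cell type and helper name are hypothetical, not the package's API):

    #include <cstdint>
    #include <vector>

    struct Cell { int32_t pos = -1; };               // pos < 0 means the cell is free

    // returns the start index of n consecutive free cells, or -1 on failure
    int64_t find_free_span(const std::vector<Cell> & cells, uint32_t head, uint32_t n) {
        const uint32_t size = (uint32_t) cells.size();
        if (n > size) return -1;

        uint32_t n_tested = 0;
        while (true) {
            if (head + n > size) {                   // not enough room before the end: wrap
                n_tested += size - head;
                head = 0;
                continue;
            }
            bool found = true;
            for (uint32_t i = 0; i < n; i++) {
                if (cells[head + i].pos >= 0) {      // occupied: restart right after it
                    found = false;
                    head     += i + 1;
                    n_tested += i + 1;
                    break;
                }
            }
            if (found) return head;
            if (n_tested >= size) return -1;         // the whole ring has been scanned
        }
    }

get_padding feeds the same bookkeeping: per the comment in the diff, the flash-attention kernels avoid runtime boundary checks only if the cell count is padded, hence 256 with FA versus 32 without.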
 
-uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    for (uint32_t i = cache.size; i > 0; --i) {
-        const llama_kv_cell & cell = cache.cells[i - 1];
+uint32_t llama_kv_cache_unified::cell_max() const {
+    for (uint32_t i = size; i > 0; --i) {
+        const llama_kv_cell & cell = cells[i - 1];
 
         if (cell.pos >= 0 && !cell.is_empty()) {
             return i;
@@ -350,289 +745,549 @@ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     return 0;
 }
 
-void llama_kv_cache_clear(struct llama_kv_cache & cache) {
-    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
-        cache.cells[i].pos = -1;
-        cache.cells[i].seq_id.clear();
-        cache.cells[i].src = -1;
-        cache.cells[i].tail = -1;
+size_t llama_kv_cache_unified::size_k_bytes() const {
+    size_t size_k_bytes = 0;
+
+    for (const auto & k : k_l) {
+        size_k_bytes += lm_ggml_nbytes(k);
     }
-    cache.head = 0;
-    cache.used = 0;
 
-    for (auto & buf : cache.bufs) {
-        lm_ggml_backend_buffer_clear(buf.get(), 0);
+    return size_k_bytes;
+}
+
+size_t llama_kv_cache_unified::size_v_bytes() const {
+    size_t size_v_bytes = 0;
+
+    for (const auto & v : v_l) {
+        size_v_bytes += lm_ggml_nbytes(v);
     }
+
+    return size_v_bytes;
 }
 
-bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1) {
-    uint32_t new_head = cache.size;
+bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+    const uint32_t n_layer = hparams.n_layer;
 
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+    const uint32_t n_kv   = cell_max();
+    const uint32_t n_used = used;
 
-    // models like Mamba or RWKV can't have a state partially erased
-    if (cache.recurrent) {
-        if (seq_id >= (int64_t) cache.size) {
-            // could be fatal
-            return false;
+    assert(n_used <= n_kv);
+
+    //const int64_t t_start = lm_ggml_time_us();
+
+    // number of cells moved
+    uint32_t n_moves = 0;
+
+    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    //const uint32_t max_moves = max_nodes()/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
+
+    // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
+    auto & ids = defrag_info.ids;
+
+    ids.clear();
+    ids.resize(n_kv, n_kv);
+
+    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+        const auto & cell0 = cells[i0];
+
+        if (!cell0.is_empty()) {
+            ids[i0] = i0;
+
+            continue;
         }
-        if (0 <= seq_id) {
-            int32_t & tail_id = cache.cells[seq_id].tail;
-            if (tail_id >= 0) {
-                const llama_kv_cell & cell = cache.cells[tail_id];
-                // partial intersection is invalid
-                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
-                    return false;
-                }
-                // invalidate tails which will be cleared
-                if (p0 <= cell.pos && cell.pos < p1) {
-                    tail_id = -1;
-                }
-            }
-        } else {
-            // seq_id is negative, then the range should include everything or nothing
-            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
-                return false;
-            }
+
+        // found a hole - fill it with data from the end of the cache
+
+        uint32_t nh = 1;
+
+        // determine the size of the hole
+        while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
+            nh++;
         }
-    }
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            if (seq_id < 0) {
-                cache.cells[i].seq_id.clear();
-            } else if (cache.cells[i].has_seq_id(seq_id)) {
-                cache.cells[i].seq_id.erase(seq_id);
-            } else {
+        uint32_t nf = 0;
+        uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
+        for (; is > i0; --is) {
+            const auto & cell1 = cells[is];
+
+            if (cell1.is_empty() || ids[is] != n_kv) {
                 continue;
             }
-            if (cache.cells[i].is_empty()) {
-                // keep count of the number of used cells
-                if (cache.cells[i].pos >= 0) cache.used--;
 
-                cache.cells[i].pos = -1;
-                cache.cells[i].src = -1;
-                if (new_head == cache.size) new_head = i;
+            // non-empty cell which is not yet moved
+            nf++;
+
+            if (nf == nh) {
+                break;
             }
         }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+        // this can only happen if `n_used` is not accurate, which would be a bug
+        LM_GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
 
-    return true;
-}
+        nf = 0;
 
-void llama_kv_cache_seq_cp(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id_src,
-                 llama_seq_id   seq_id_dst,
-                    llama_pos   p0,
-                    llama_pos   p1) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+        uint32_t i1 = is;
 
-    if (cache.recurrent) {
-        if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
-            llama_kv_cell & tail_src = cache.cells[seq_id_src];
-            llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
-            if (tail_dst.tail >= 0) {
-                // clear destination seq_id if it wasn't empty
-                llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
+        // are we moving a continuous block of memory?
+        bool cont = false;
 
-                cell_dst.seq_id.erase(seq_id_dst);
-                tail_dst.tail = -1;
-                if (cell_dst.seq_id.empty()) {
-                    cell_dst.pos = -1;
-                    cell_dst.delta = -1;
-                    cell_dst.src = -1;
-                    cache.used -= 1;
+        // should we stop searching for the next move?
+        bool stop = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
                 }
+
+                cont = false;
+                continue;
             }
-            if (tail_src.tail >= 0) {
-                llama_kv_cell & cell_src = cache.cells[tail_src.tail];
 
-                cell_src.seq_id.insert(seq_id_dst);
-                tail_dst.tail = tail_src.tail;
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
             }
         }
 
-        return;
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return false;
     }
-    // otherwise, this is the KV cache of a Transformer-like model
 
-    cache.head = 0;
+    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.cells[i].seq_id.insert(seq_id_dst);
+    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+    return true;
+}
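defrag_prepare only plans the compaction: it fills defrag_info.ids so that cell i is relocated to ids[i], where ids[i] == i or ids[i] == n_kv means "leave in place", and it caps the plan at max_moves because each move later costs 6*n_layer graph nodes. The actual K/V rows are copied by a separate defrag graph; a consumer of such an ids mapping could look roughly like this (purely illustrative helper, not the package's code):

    #include <cstdint>
    #include <vector>

    // apply a defrag plan: cell i moves to ids[i]; ids[i] == i or ids[i] == n_kv means "no move"
    template <typename Cell>
    void apply_moves(std::vector<Cell> & cells, const std::vector<uint32_t> & ids) {
        const uint32_t n_kv = (uint32_t) ids.size();
        for (uint32_t i = 0; i < n_kv; ++i) {
            if (ids[i] == i || ids[i] == n_kv) {
                continue;                  // cell stays where it is
            }
            cells[ids[i]] = cells[i];      // destinations were holes left of their donors,
            cells[i]      = Cell();        // so a single left-to-right pass is safe
        }
    }

The left-to-right pass is safe because the planner always fills holes (low indices) from donors near the end of the cache, so every destination is distinct and was empty when the plan was built.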
+
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+    uint32_t cell_count = 0;
+
+    // Count the number of cells with the specified seq_id
+    // Find all the ranges of cells with this seq id (or all, when -1)
+    uint32_t cell_range_begin = size;
+    for (uint32_t i = 0; i < size; ++i) {
+        const auto & cell = cells[i];
+        if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+            ++cell_count;
+            if (cell_range_begin == size) {
+                cell_range_begin = i;
+            }
+        } else {
+            if (cell_range_begin != size) {
+                cell_ranges.emplace_back(cell_range_begin, i);
+                cell_range_begin = size;
+            }
         }
     }
+    if (cell_range_begin != size) {
+        cell_ranges.emplace_back(cell_range_begin, size);
+    }
+
+    // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+    uint32_t cell_count_check = 0;
+    for (const auto & range : cell_ranges) {
+        cell_count_check += range.second - range.first;
+    }
+    LM_GGML_ASSERT(cell_count == cell_count_check);
+
+    io.write(&cell_count, sizeof(cell_count));
+
+    state_write_meta(io, cell_ranges, seq_id);
+    state_write_data(io, cell_ranges);
 }
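state_write first compresses the set of matching cells into [begin, end) ranges, so the tensor data can later be written as a few large contiguous blocks instead of cell by cell. The range-collection idiom in isolation (hypothetical predicate-based helper, for illustration only):

    #include <cstdint>
    #include <functional>
    #include <utility>
    #include <vector>

    // collect maximal [begin, end) runs of indices i in [0, size) where match(i) is true
    std::vector<std::pair<uint32_t, uint32_t>>
    collect_ranges(uint32_t size, const std::function<bool(uint32_t)> & match) {
        std::vector<std::pair<uint32_t, uint32_t>> ranges;
        uint32_t begin = size;                       // "size" doubles as the "no open run" marker
        for (uint32_t i = 0; i < size; ++i) {
            if (match(i)) {
                if (begin == size) begin = i;        // open a new run
            } else if (begin != size) {
                ranges.emplace_back(begin, i);       // close the current run
                begin = size;
            }
        }
        if (begin != size) ranges.emplace_back(begin, size);
        return ranges;
    }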
 
-void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
-    uint32_t new_head = cache.size;
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    uint32_t cell_count;
+    io.read_to(&cell_count, sizeof(cell_count));
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.recurrent && (llama_seq_id) i != seq_id) {
-            cache.cells[i].tail = -1;
-        }
-        if (!cache.cells[i].has_seq_id(seq_id)) {
-            if (cache.cells[i].pos >= 0) cache.used--;
-            cache.cells[i].pos = -1;
-            cache.cells[i].src = -1;
-            cache.cells[i].seq_id.clear();
-            if (new_head == cache.size) new_head = i;
+    bool res = true;
+    res = res && state_read_meta(io, cell_count, seq_id);
+    res = res && state_read_data(io, cell_count);
+
+    if (!res) {
+        if (seq_id == -1) {
+            clear();
         } else {
-            cache.cells[i].seq_id.clear();
-            cache.cells[i].seq_id.insert(seq_id);
+            seq_rm(seq_id, -1, -1);
         }
+        throw std::runtime_error("failed to restore kv cache");
     }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }
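On a failed restore, state_read rolls the cache back to a defined state before throwing: a full-cache restore wipes everything with clear(), while a single-sequence restore removes just that sequence with seq_rm(seq_id, -1, -1) (the -1 bounds meaning the whole position range). A hedged sketch of caller-side handling, assuming the declarations above are in scope:

    #include <cstdio>
    #include <stdexcept>

    // sketch of a caller: a failed restore leaves the cache empty (or without the
    // sequence), so the session data can simply be rejected and generation restarted
    bool try_restore_session(llama_kv_cache_unified & kv, llama_io_read_i & io, llama_seq_id seq_id) {
        try {
            kv.state_read(io, seq_id);
            return true;
        } catch (const std::runtime_error & err) {
            std::fprintf(stderr, "restore failed: %s\n", err.what());
            return false;   // the cache was already cleared / the seq removed by state_read
        }
    }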
 
-void llama_kv_cache_seq_add(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                    llama_pos   delta) {
-    uint32_t new_head = cache.size;
+void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+    for (const auto & range : cell_ranges) {
+        for (uint32_t i = range.first; i < range.second; ++i) {
+            const auto & cell = cells[i];
+            const llama_pos pos      = cell.pos;
+            const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
 
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-    // If there is no range then return early to avoid looping over the cache.
-    if (p0 == p1) return;
+            io.write(&pos,      sizeof(pos));
+            io.write(&n_seq_id, sizeof(n_seq_id));
 
-    if (cache.recurrent) {
-        // for Mamba-like or RWKV models, only the pos needs to be shifted
-        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
-            const int32_t tail_id = cache.cells[seq_id].tail;
-            if (tail_id >= 0) {
-                llama_kv_cell & cell = cache.cells[tail_id];
-                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
-                    cell.pos += delta;
+            if (n_seq_id) {
+                for (auto seq_id : cell.seq_id) {
+                    io.write(&seq_id, sizeof(seq_id));
                 }
             }
         }
-        return;
     }
+}
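state_write_meta fixes the layout of the per-cell metadata stream: for every cell in the ranges it emits pos, then n_seq_id (forced to 0 when saving a single sequence, since the target id is implied by the caller), then the seq_id list. The same record, spelled out against a plain byte buffer (a hedged sketch of the serialized layout, written field by field in host byte order; not an official format spec):

    #include <cstdint>
    #include <set>
    #include <vector>

    // per-cell metadata record, mirroring state_write_meta:
    //   llama_pos pos
    //   uint32_t  n_seq_id     (0 for a single-sequence save; ids implied)
    //   int32_t   seq_id[n_seq_id]
    void append_cell_meta(std::vector<uint8_t> & buf, int32_t pos,
                          const std::set<int32_t> & seq_ids, bool single_seq) {
        auto put = [&buf](const void * p, size_t n) {
            const uint8_t * b = static_cast<const uint8_t *>(p);
            buf.insert(buf.end(), b, b + n);
        };
        const uint32_t n_seq_id = single_seq ? 0 : (uint32_t) seq_ids.size();
        put(&pos,      sizeof(pos));
        put(&n_seq_id, sizeof(n_seq_id));
        if (n_seq_id) {
            for (int32_t id : seq_ids) put(&id, sizeof(id));
        }
    }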
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.has_shift = true;
-            cache.cells[i].pos   += delta;
-            cache.cells[i].delta += delta;
+void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+    const uint32_t v_trans = this->v_trans ? 1 : 0;
+    const uint32_t n_layer = hparams.n_layer;
 
-            if (cache.cells[i].pos < 0) {
-                if (!cache.cells[i].is_empty()) {
-                    cache.used--;
-                }
-                cache.cells[i].pos = -1;
-                cache.cells[i].seq_id.clear();
-                if (new_head == cache.size) {
-                    new_head = i;
-                }
-            }
+    io.write(&v_trans, sizeof(v_trans));
+    io.write(&n_layer, sizeof(n_layer));
+
+    std::vector<uint8_t> tmp_buf;
+
+    // Iterate and write all the keys first, each row is a cell
+    // Get whole range at a time
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+        // Write key type
+        const int32_t k_type_i = (int32_t)k_l[il]->type;
+        io.write(&k_type_i, sizeof(k_type_i));
+
+        // Write row size of key
+        const uint64_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        io.write(&k_size_row, sizeof(k_size_row));
+
+        // Read each range of cells of k_size length each into tmp_buf and write out
+        for (const auto & range : cell_ranges) {
+            const size_t range_size = range.second - range.first;
+            const size_t buf_size = range_size * k_size_row;
+            io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
         }
     }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    // Otherwise we just start the next search from the beginning.
-    cache.head = new_head != cache.size ? new_head : 0;
-}
+    if (!v_trans) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-void llama_kv_cache_seq_div(
-        struct llama_kv_cache & cache,
-                 llama_seq_id   seq_id,
-                    llama_pos   p0,
-                    llama_pos   p1,
-                          int   d) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-    // If there is no range then return early to avoid looping over the cache.
-    if (p0 == p1) return;
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
 
-    if (cache.recurrent) {
-        // for Mamba-like or RWKV models, only the pos needs to be changed
-        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
-            const int32_t tail_id = cache.cells[seq_id].tail;
-            if (tail_id >= 0) {
-                llama_kv_cell & cell = cache.cells[tail_id];
-                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
-                    cell.pos /= d;
+            // Write row size of value
+            const uint64_t v_size_row = lm_ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            io.write(&v_size_row, sizeof(v_size_row));
+
+            // Read each range of cells of v_size length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t buf_size = range_size * v_size_row;
+                io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+            }
+        }
+    } else {
+        // When v is transposed, we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = size;
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const uint32_t v_size_el = lm_ggml_type_size(v_l[il]->type);
+            io.write(&v_size_el, sizeof(v_size_el));
+
+            // Write GQA embedding size
+            io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    const size_t buf_size = range_size * v_size_el;
+                    io.write_tensor(v_l[il], src_offset, buf_size);
                 }
             }
         }
-        return;
     }
+}
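The two write paths mirror the two in-memory layouts. When V is stored row-per-cell, a cell's values form one contiguous row of n_embd_v_gqa elements, so each range is a single write. When V is transposed, element j of cell c sits at flat index c + j * kv_size, which is exactly the src_offset formula above: each of the n_embd_v_gqa feature rows contributes one contiguous slice per range. The offset arithmetic in isolation (assuming an element size v_size_el, as in the code above):

    #include <cstddef>
    #include <cstdint>

    // byte offset of (cell, feature j) in a row-per-cell V layout [kv_size x n_embd]
    size_t v_offset_rowwise(uint32_t cell, uint32_t j, uint32_t n_embd, size_t v_size_el) {
        return ((size_t) cell * n_embd + j) * v_size_el;
    }

    // byte offset of the same element in the transposed layout [n_embd x kv_size]:
    // consecutive cells are adjacent, the feature index j strides by kv_size
    size_t v_offset_transposed(uint32_t cell, uint32_t j, uint32_t kv_size, size_t v_size_el) {
        return ((size_t) j * kv_size + cell) * v_size_el;
    }

This is why a whole range [first, second) can be flushed with one io.write_tensor per feature row: in the transposed layout it occupies (second - first) * v_size_el consecutive bytes starting at (first + j * kv_size) * v_size_el.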
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.has_shift = true;
+bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+    if (dest_seq_id != -1) {
+        // single sequence
 
-            {
-                llama_pos p_old = cache.cells[i].pos;
-                cache.cells[i].pos   /= d;
-                cache.cells[i].delta += cache.cells[i].pos - p_old;
+        seq_rm(dest_seq_id, -1, -1);
+
+        llama_sbatch sbatch;
+        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+
+        batch.n_tokens = cell_count;
+        batch.n_seq_tokens = cell_count;
+        batch.n_seqs = 1;
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_pos pos;
+            uint32_t  n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            if (n_seq_id != 0) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                return false;
             }
+
+            batch.pos[i] = pos;
+        }
+        batch.n_seq_id[0] = 1;
+        batch.seq_id[0] = &dest_seq_id;
+        if (!find_slot(batch)) {
+            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            return false;
+        }
+        commit();
+
+        // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+        // Assume that this is one contiguous block of cells
+        LM_GGML_ASSERT(head + cell_count <= size);
+        LM_GGML_ASSERT(cells[head].pos == batch.pos[0]);
+        LM_GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        LM_GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
+        LM_GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
+    } else {
+        // whole KV cache restore
+
+        if (cell_count > size) {
+            LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+            return false;
         }
-    }
-}
 
-llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
-    llama_pos result = 0;
+        clear();
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_kv_cell & cell = cells[i];
+
+            llama_pos pos;
+            uint32_t  n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            cell.pos = pos;
 
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id)) {
-            result = std::max(result, cache.cells[i].pos);
+            for (uint32_t j = 0; j < n_seq_id; ++j) {
+                llama_seq_id seq_id;
+                io.read_to(&seq_id, sizeof(seq_id));
+
+                // TODO: llama_kv_cache_unified should have a notion of max sequences
+                //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                if (seq_id < 0) {
+                    //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                    LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+                    return false;
+                }
+
+                cell.seq_id.insert(seq_id);
+
+                if (recurrent) {
+                    int32_t & tail = cells[seq_id].tail;
+                    if (tail != -1) {
+                        LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+                        return false;
+                    }
+                    tail = i;
+                }
+            }
         }
+
+        head = 0;
+        used = cell_count;
     }
 
-    return result;
+    if (recurrent) {
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            uint32_t cell_id = head + i;
+            // make sure the recurrent states will keep their restored state
+            cells[cell_id].src = cell_id;
+        }
+    }
+
+    return true;
 }
 
-void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
-    if (!cache.recurrent) {
-        cache.do_defrag = true;
+bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+    uint32_t v_trans;
+    uint32_t n_layer;
+    io.read_to(&v_trans, sizeof(v_trans));
+    io.read_to(&n_layer, sizeof(n_layer));
+
+    if (n_layer != hparams.n_layer) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+        return false;
+    }
+    if (cell_count > size) {
+        LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
+        return false;
+    }
+    if (v_trans != (bool) v_trans) {
+        LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+        return false;
     }
-}
 
-int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) {
-    int result = 0;
+    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
 
-    for (uint32_t i = 0; i < kv.size; i++) {
-        result += kv.cells[i].seq_id.size();
+        // Read type of key
+        int32_t k_type_i_ref;
+        io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+        const int32_t k_type_i = (int32_t) k_l[il]->type;
+        if (k_type_i != k_type_i_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+            return false;
+        }
+
+        // Read row size of key
+        uint64_t k_size_row_ref;
+        io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+        const size_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        if (k_size_row != k_size_row_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+            return false;
+        }
+
+        if (cell_count) {
+            // Read and set the keys for the whole cell range
+            lm_ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+        }
     }
 
-    return result;
-}
+    if (!v_trans) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) {
-    return kv.used;
-}
+            // Read type of value
+            int32_t v_type_i_ref;
+            io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return false;
+            }
+
+            // Read row size of value
+            uint64_t v_size_row_ref;
+            io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
+            const size_t v_size_row = lm_ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            if (v_size_row != v_size_row_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
+                return false;
+            }
 
-bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) {
-    return kv.can_shift;
+            if (cell_count) {
+                // Read and set the values for the whole cell range
+                lm_ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+            }
+        }
+    } else {
+        // For each layer, read the values for each cell (transposed)
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Read type of value
+            int32_t v_type_i_ref;
+            io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            if (v_type_i != v_type_i_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                return false;
+            }
+
+            // Read element size of value
+            uint32_t v_size_el_ref;
+            io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
+            const size_t v_size_el = lm_ggml_type_size(v_l[il]->type);
+            if (v_size_el != v_size_el_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
+                return false;
+            }
+
+            // Read GQA embedding size
+            uint32_t n_embd_v_gqa_ref;
+            io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+            if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
+                return false;
+            }
+
+            if (cell_count) {
+                // For each row in the transposed matrix, read the values for the whole cell range
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const size_t dst_offset = (head + j * size) * v_size_el;
+                    lm_ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                }
+            }
+        }
+    }
+
+    return true;
 }
 
 //
 // kv cache view
 //
 
-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) {
-    struct llama_kv_cache_view result = {
+llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
+    llama_kv_cache_view result = {
         /*.n_cells            = */ 0,
         /*.n_seq_max          = */ n_seq_max,
         /*.token_count        = */ 0,
-        /*.used_cells         = */ llama_get_kv_cache_used_cells(kv),
+        /*.used_cells         = */ kv.get_used_cells(),
         /*.max_contiguous     = */ 0,
         /*.max_contiguous_idx = */ -1,
         /*.cells              = */ nullptr,
@@ -642,7 +1297,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache
     return result;
 }
 
-void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+void llama_kv_cache_view_free(llama_kv_cache_view * view) {
     if (view->cells != nullptr) {
         free(view->cells);
         view->cells = nullptr;
@@ -653,18 +1308,25 @@ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
     }
 }
 
-void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv) {
-    if (uint32_t(view->n_cells) < kv.size || view->cells == nullptr) {
-        view->n_cells = int32_t(kv.size);
-        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
+    // TODO: rework this in the future, for now quick hack
+    const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
+    if (kvu == nullptr) {
+        LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
+        return;
+    }
+
+    if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
+        view->n_cells = int32_t(kvu->size);
+        void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
         LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
-        view->cells = (struct llama_kv_cache_view_cell *)p;
+        view->cells = (llama_kv_cache_view_cell *)p;
         p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
         LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
         view->cells_sequences = (llama_seq_id *)p;
     }
 
-    const std::vector<llama_kv_cell> & kv_cells = kv.cells;
+    const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
     llama_kv_cache_view_cell * c_curr = view->cells;
     llama_seq_id * cs_curr = view->cells_sequences;
     int32_t used_cells = 0;
@@ -673,7 +1335,7 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct
     uint32_t max_contig = 0;
     int32_t max_contig_idx = -1;
 
-    for (int32_t i = 0; i < int32_t(kv.size); i++, c_curr++, cs_curr += view->n_seq_max) {
+    for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
         const size_t curr_size = kv_cells[i].seq_id.size();
         token_count += curr_size;
         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
@@ -711,8 +1373,8 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct
     view->max_contiguous_idx = max_contig_idx;
     view->token_count = token_count;
     view->used_cells = used_cells;
-    if (uint32_t(used_cells) != kv.used) {
+    if (uint32_t(used_cells) != kvu->used) {
         LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
-            __func__, kv.used, used_cells);
+            __func__, kvu->used, used_cells);
     }
 }
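With the cache now behind an abstract llama_kv_cache interface, the view API takes a pointer and downcasts internally; only llama_kv_cache_unified is currently supported, and other cache types produce the error log and an unchanged view. A hedged usage sketch, assuming kv points at the context's cache and the view declarations above are in scope:

    #include <cstdio>

    // poll cache occupancy from monitoring code (sketch; error handling elided)
    void log_kv_usage(const llama_kv_cache * kv, int32_t n_seq_max) {
        llama_kv_cache_view view = llama_kv_cache_view_init(*kv, n_seq_max);

        llama_kv_cache_view_update(&view, kv);   // logs an error and returns if kv is not unified

        std::printf("used cells: %d, tokens: %d, max contiguous free run: %d\n",
                view.used_cells, view.token_count, view.max_contiguous);

        llama_kv_cache_view_free(&view);
    }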