cui-llama.rn 1.6.0 → 1.7.0

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/llama-kv-cache.cpp

@@ -4,43 +4,40 @@
4
4
  #include "llama-batch.h"
5
5
  #include "llama-cparams.h"
6
6
  #include "llama-model.h"
7
+ #include "llama-context.h"
7
8
 
8
9
  #include <algorithm>
9
10
  #include <cassert>
11
+ #include <cmath>
10
12
  #include <limits>
11
13
  #include <map>
12
14
  #include <stdexcept>
13
15
 
14
- llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
15
- }
16
-
17
- bool llama_kv_cache_unified::init(
18
- const llama_model & model,
19
- const llama_cparams & cparams,
20
- lm_ggml_type type_k,
21
- lm_ggml_type type_v,
22
- uint32_t kv_size,
23
- bool offload) {
24
- const int32_t n_layer = hparams.n_layer;
25
-
26
- has_shift = false;
27
-
28
- recurrent = llama_model_is_recurrent(&model);
29
- v_trans = !recurrent && !cparams.flash_attn;
30
- can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
31
-
32
- LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
33
- __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer, can_shift);
34
-
35
- head = 0;
36
- size = kv_size;
37
- used = 0;
16
+ //
17
+ // llama_kv_cache_unified
18
+ //
38
19
 
39
- this->type_k = type_k;
40
- this->type_v = type_v;
20
+ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
21
+ // the FA kernels require padding to avoid extra runtime boundary checks
22
+ return cparams.flash_attn ? 256u : 32u;
23
+ }
41
24
 
42
- cells.clear();
43
- cells.resize(kv_size);
25
+ llama_kv_cache_unified::llama_kv_cache_unified(
26
+ const llama_model & model,
27
+ layer_filter_cb && filter,
28
+ lm_ggml_type type_k,
29
+ lm_ggml_type type_v,
30
+ bool v_trans,
31
+ bool offload,
32
+ uint32_t kv_size,
33
+ uint32_t n_seq_max,
34
+ uint32_t n_pad,
35
+ uint32_t n_swa,
36
+ llama_swa_type swa_type) :
37
+ model(model), hparams(model.hparams), v_trans(v_trans),
38
+ n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
39
+
40
+ LM_GGML_ASSERT(kv_size % n_pad == 0);
44
41
 
45
42
  // create a context for each buffer type
46
43
  std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
@@ -48,7 +45,7 @@ bool llama_kv_cache_unified::init(
48
45
  auto it = ctx_map.find(buft);
49
46
  if (it == ctx_map.end()) {
50
47
  lm_ggml_init_params params = {
51
- /*.mem_size =*/ size_t(2u*n_layer*lm_ggml_tensor_overhead()),
48
+ /*.mem_size =*/ size_t(2u*hparams.n_layer*lm_ggml_tensor_overhead()),
52
49
  /*.mem_buffer =*/ NULL,
53
50
  /*.no_alloc =*/ true,
54
51
  };
@@ -67,40 +64,50 @@ bool llama_kv_cache_unified::init(
67
64
  return it->second;
68
65
  };
69
66
 
70
- k_l.reserve(n_layer);
71
- v_l.reserve(n_layer);
67
+ head = 0;
68
+ size = kv_size;
69
+ used = 0;
72
70
 
73
- for (int i = 0; i < n_layer; i++) {
74
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
75
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
71
+ cells.resize(kv_size);
72
+
73
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
74
+ if (filter && !filter(il)) {
75
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
76
+ continue;
77
+ }
78
+
79
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
80
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
76
81
 
77
82
  const char * dev_name = "CPU";
78
83
 
79
- lm_ggml_backend_buffer_type_t buft;
84
+ lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
85
+
80
86
  if (offload) {
81
- auto * dev = model.dev_layer(i);
87
+ auto * dev = model.dev_layer(il);
82
88
  buft = lm_ggml_backend_dev_buffer_type(dev);
83
89
 
84
90
  dev_name = lm_ggml_backend_dev_name(dev);
85
- } else {
86
- buft = lm_ggml_backend_cpu_buffer_type();
87
91
  }
88
92
 
89
- LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
90
- i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
93
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
91
94
 
92
95
  lm_ggml_context * ctx = ctx_for_buft(buft);
93
96
  if (!ctx) {
94
- LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
95
- return false;
97
+ throw std::runtime_error("failed to create ggml context for kv cache");
96
98
  }
97
99
 
98
- lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
99
- lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
100
- lm_ggml_format_name(k, "cache_k_l%d", i);
101
- lm_ggml_format_name(v, "cache_v_l%d", i);
102
- k_l.push_back(k);
103
- v_l.push_back(v);
100
+ lm_ggml_tensor * k;
101
+ lm_ggml_tensor * v;
102
+
103
+ k = lm_ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
104
+ v = lm_ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
105
+
106
+ lm_ggml_format_name(k, "cache_k_l%d", il);
107
+ lm_ggml_format_name(v, "cache_v_l%d", il);
108
+
109
+ map_layer_ids[il] = layers.size();
110
+ layers.push_back({ il, k, v });
104
111
  }
105
112
 
106
113
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
@@ -110,56 +117,32 @@ bool llama_kv_cache_unified::init(
110
117
 
111
118
  lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
112
119
  if (!buf) {
113
- LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
114
- return false;
120
+ throw std::runtime_error("failed to allocate buffer for kv cache");
115
121
  }
116
- lm_ggml_backend_buffer_clear(buf, 0);
117
- LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
118
- bufs.emplace_back(buf);
119
- }
120
-
121
- return true;
122
- }
123
-
124
- int32_t llama_kv_cache_unified::get_n_tokens() const {
125
- int32_t result = 0;
126
-
127
- for (uint32_t i = 0; i < size; i++) {
128
- result += cells[i].seq_id.size();
129
- }
130
122
 
131
- return result;
132
- }
133
-
134
- int32_t llama_kv_cache_unified::get_used_cells() const {
135
- return used;
136
- }
123
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
137
124
 
138
- size_t llama_kv_cache_unified::total_size() const {
139
- size_t size = 0;
140
- for (const auto & buf : bufs) {
141
- size += lm_ggml_backend_buffer_get_size(buf.get());
125
+ lm_ggml_backend_buffer_clear(buf, 0);
126
+ bufs.emplace_back(buf);
142
127
  }
143
128
 
144
- return size;
145
- }
129
+ {
130
+ const size_t memory_size_k = size_k_bytes();
131
+ const size_t memory_size_v = size_v_bytes();
146
132
 
147
- llama_pos llama_kv_cache_unified::pos_max() const {
148
- llama_pos pos_max = -1;
149
- for (const auto & cell : cells) {
150
- pos_max = std::max(pos_max, cell.pos);
133
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
134
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
135
+ lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
136
+ lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
151
137
  }
152
-
153
- return pos_max;
154
138
  }
155
139
 
156
140
  void llama_kv_cache_unified::clear() {
157
- for (int32_t i = 0; i < (int32_t) size; ++i) {
141
+ for (uint32_t i = 0; i < size; ++i) {
158
142
  cells[i].pos = -1;
159
143
  cells[i].seq_id.clear();
160
- cells[i].src = -1;
161
- cells[i].tail = -1;
162
144
  }
145
+
163
146
  head = 0;
164
147
  used = 0;
165
148
 
@@ -179,35 +162,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
179
162
  p1 = std::numeric_limits<llama_pos>::max();
180
163
  }
181
164
 
182
- // models like Mamba or RWKV can't have a state partially erased
183
- if (recurrent) {
184
- if (seq_id >= (int64_t) size) {
185
- // could be fatal
186
- return false;
187
- }
188
- if (0 <= seq_id) {
189
- int32_t & tail_id = cells[seq_id].tail;
190
- if (tail_id >= 0) {
191
- const llama_kv_cell & cell = cells[tail_id];
192
- // partial intersection is invalid
193
- if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
194
- return false;
195
- }
196
- // invalidate tails which will be cleared
197
- if (p0 <= cell.pos && cell.pos < p1) {
198
- tail_id = -1;
199
- }
200
- }
201
- } else {
202
- // seq_id is negative, then the range should include everything or nothing
203
- if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
204
- return false;
205
- }
206
- }
207
-
208
- return true;
209
- }
210
-
211
165
  for (uint32_t i = 0; i < size; ++i) {
212
166
  if (cells[i].pos >= p0 && cells[i].pos < p1) {
213
167
  if (seq_id < 0) {
@@ -217,6 +171,7 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
217
171
  } else {
218
172
  continue;
219
173
  }
174
+
220
175
  if (cells[i].is_empty()) {
221
176
  // keep count of the number of used cells
222
177
  if (cells[i].pos >= 0) {
@@ -224,7 +179,6 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
224
179
  }
225
180
 
226
181
  cells[i].pos = -1;
227
- cells[i].src = -1;
228
182
 
229
183
  if (new_head == size) {
230
184
  new_head = i;
@@ -254,34 +208,6 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
254
208
  p1 = std::numeric_limits<llama_pos>::max();
255
209
  }
256
210
 
257
- if (recurrent) {
258
- if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
259
- llama_kv_cell & tail_src = cells[seq_id_src];
260
- llama_kv_cell & tail_dst = cells[seq_id_dst];
261
- if (tail_dst.tail >= 0) {
262
- // clear destination seq_id if it wasn't empty
263
- llama_kv_cell & cell_dst = cells[tail_dst.tail];
264
-
265
- cell_dst.seq_id.erase(seq_id_dst);
266
- tail_dst.tail = -1;
267
- if (cell_dst.seq_id.empty()) {
268
- cell_dst.pos = -1;
269
- cell_dst.delta = -1;
270
- cell_dst.src = -1;
271
- used -= 1;
272
- }
273
- }
274
- if (tail_src.tail >= 0) {
275
- llama_kv_cell & cell_src = cells[tail_src.tail];
276
-
277
- cell_src.seq_id.insert(seq_id_dst);
278
- tail_dst.tail = tail_src.tail;
279
- }
280
- }
281
-
282
- return;
283
- }
284
-
285
211
  // otherwise, this is the KV of a Transformer-like model
286
212
  head = 0;
287
213
 
@@ -296,17 +222,12 @@ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
296
222
  uint32_t new_head = size;
297
223
 
298
224
  for (uint32_t i = 0; i < size; ++i) {
299
- if (recurrent && (llama_seq_id) i != seq_id) {
300
- cells[i].tail = -1;
301
- }
302
-
303
225
  if (!cells[i].has_seq_id(seq_id)) {
304
226
  if (cells[i].pos >= 0) {
305
227
  used--;
306
228
  }
307
229
 
308
230
  cells[i].pos = -1;
309
- cells[i].src = -1;
310
231
  cells[i].seq_id.clear();
311
232
 
312
233
  if (new_head == size){
@@ -344,23 +265,10 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
344
265
  return;
345
266
  }
346
267
 
347
- if (recurrent) {
348
- // for Mamba-like or RWKV models, only the pos needs to be shifted
349
- if (0 <= seq_id && seq_id < (int64_t) size) {
350
- const int32_t tail_id = cells[seq_id].tail;
351
- if (tail_id >= 0) {
352
- llama_kv_cell & cell = cells[tail_id];
353
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
354
- cell.pos += delta;
355
- }
356
- }
357
- }
358
- return;
359
- }
360
-
361
268
  for (uint32_t i = 0; i < size; ++i) {
362
269
  if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
363
270
  has_shift = true;
271
+
364
272
  cells[i].pos += delta;
365
273
  cells[i].delta += delta;
366
274
 
@@ -400,21 +308,6 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
400
308
  return;
401
309
  }
402
310
 
403
- if (recurrent) {
404
- // for Mamba-like or RWKV models, only the pos needs to be changed
405
- if (0 <= seq_id && seq_id < (int64_t) size) {
406
- const int32_t tail_id = cells[seq_id].tail;
407
- if (tail_id >= 0) {
408
- llama_kv_cell & cell = cells[tail_id];
409
- if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
410
- cell.pos /= d;
411
- }
412
- }
413
- }
414
-
415
- return;
416
- }
417
-
418
311
  for (uint32_t i = 0; i < size; ++i) {
419
312
  if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
420
313
  has_shift = true;
@@ -428,250 +321,165 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
428
321
  }
429
322
  }
430
323
 
431
- llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
432
- llama_pos result = 0;
324
+ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
325
+ llama_pos result = std::numeric_limits<llama_pos>::max();
433
326
 
434
327
  for (uint32_t i = 0; i < size; ++i) {
435
328
  if (cells[i].has_seq_id(seq_id)) {
436
- result = std::max(result, cells[i].pos);
329
+ result = std::min(result, cells[i].pos);
437
330
  }
438
331
  }
439
332
 
440
- return result;
441
- }
442
-
443
- void llama_kv_cache_unified::defrag() {
444
- if (!recurrent) {
445
- do_defrag = true;
333
+ if (result == std::numeric_limits<llama_pos>::max()) {
334
+ result = -1;
446
335
  }
336
+
337
+ return result;
447
338
  }
448
339
 
449
- void llama_kv_cache_unified::restore() {
450
- if (pending.ranges.empty()) {
451
- return;
452
- }
340
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
341
+ llama_pos result = -1;
453
342
 
454
- // TODO: tmp - move to llama_kv_cache_recurrent
455
- if (recurrent) {
456
- seq_rm(-1, -1, -1);
457
- return;
343
+ for (uint32_t i = 0; i < size; ++i) {
344
+ if (cells[i].has_seq_id(seq_id)) {
345
+ result = std::max(result, cells[i].pos);
346
+ }
458
347
  }
459
348
 
460
- uint32_t new_head = size;
461
-
462
- for (auto & range : pending.ranges) {
463
- for (uint32_t i = range.c0; i < range.c1; ++i) {
464
- cells[i].seq_id.clear();
465
-
466
- // keep count of the number of used cells
467
- if (cells[i].pos >= 0) {
468
- used--;
469
- }
349
+ return result;
350
+ }
470
351
 
471
- cells[i].pos = -1;
472
- cells[i].src = -1;
352
+ void llama_kv_cache_unified::restore() {
353
+ for (const auto & [id, cell] : recovery.cells) {
354
+ // TODO: move to new `struct kv_cells`
355
+ const bool is_empty0 = cells[id].is_empty();
356
+ const bool is_empty1 = cell.is_empty();
357
+
358
+ if (!is_empty0 && is_empty1) {
359
+ used--;
360
+ } else if (is_empty0 && !is_empty1) {
361
+ used++;
473
362
  }
474
363
 
475
- new_head = std::min(new_head, range.c0);
364
+ cells[id] = cell;
476
365
  }
477
366
 
478
- if (new_head != size && new_head < head) {
479
- head = new_head;
480
- }
367
+ recovery.clear();
481
368
  }
482
369
 
483
370
  void llama_kv_cache_unified::commit() {
484
- // TODO: tmp - move to llama_kv_cache_recurrent
485
- if (recurrent) {
486
- return;
487
- }
488
-
489
- if (pending.ranges.empty()) {
490
- LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
491
- __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
371
+ if (recovery.cells.empty()) {
372
+ LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
373
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
492
374
  return;
493
375
  }
494
376
 
495
- pending.ranges.clear();
377
+ recovery.clear();
496
378
  }
497
379
 
498
- bool llama_kv_cache_unified::get_can_shift() const {
499
- return can_shift;
500
- }
380
+ bool llama_kv_cache_unified::update(llama_context & lctx) {
381
+ bool need_reserve = false;
501
382
 
502
- bool llama_kv_cache_unified::find_slot(
503
- const llama_ubatch & ubatch) {
504
- const uint32_t n_tokens = ubatch.n_tokens;
505
- const uint32_t n_seqs = ubatch.n_seqs;
506
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
383
+ auto * sched = lctx.get_sched();
507
384
 
508
- // if we have enough unused cells before the current head ->
509
- // better to start searching from the beginning of the cache, hoping to fill it
510
- if (head > used + 2*ubatch.n_tokens) {
511
- head = 0;
512
- }
385
+ if (has_shift) {
386
+ if (!get_can_shift()) {
387
+ LM_GGML_ABORT("The current KV cache / model configuration does not support K-shift");
388
+ }
513
389
 
514
- if (recurrent) {
515
- // For recurrent state architectures (like Mamba or RWKV),
516
- // each cache cell can store the state for a whole sequence.
517
- // A slot should be always be contiguous.
390
+ LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
518
391
 
519
- // can only process batches with an equal number of new tokens in each sequence
520
- LM_GGML_ASSERT(ubatch.equal_seqs);
392
+ // apply K-shift if needed
393
+ if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
394
+ lm_ggml_backend_sched_reset(sched);
521
395
 
522
- int32_t min = size - 1;
523
- int32_t max = 0;
396
+ auto * gf = lctx.graph_init();
524
397
 
525
- // everything should fit if all seq_ids are smaller than the max
526
- for (uint32_t s = 0; s < n_seqs; ++s) {
527
- const uint32_t n_seq_id = ubatch.n_seq_id[s];
528
- for (uint32_t j = 0; j < n_seq_id; ++j) {
529
- const llama_seq_id seq_id = ubatch.seq_id[s][j];
398
+ auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
530
399
 
531
- if (seq_id < 0 || (uint32_t) seq_id >= size) {
532
- // too big seq_id
533
- // TODO: would it be possible to resize the cache instead?
534
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
535
- return false;
536
- }
537
- if (j > 0) {
538
- llama_kv_cell & seq = cells[seq_id];
539
- if (seq.tail >= 0) {
540
- llama_kv_cell & cell = cells[seq.tail];
541
- // clear cells from seq_ids that become shared
542
- // (should not normally happen, but let's handle it anyway)
543
- cell.seq_id.erase(seq_id);
544
- seq.tail = -1;
545
- if (cell.seq_id.empty()) {
546
- cell.pos = -1;
547
- cell.src = -1;
548
- used -= 1;
549
- }
550
- }
551
- }
552
- }
400
+ lm_ggml_backend_sched_alloc_graph(sched, gf);
401
+
402
+ res->set_inputs(nullptr);
403
+
404
+ lctx.graph_compute(gf, false);
405
+
406
+ need_reserve = true;
553
407
  }
554
408
 
555
- #ifndef NDEBUG
556
409
  {
557
- std::vector<int32_t> tails_verif;
558
- tails_verif.assign(size, -1);
559
- for (uint32_t i = 0; i < size; ++i) {
560
- llama_kv_cell & cell = cells[i];
561
- for (llama_seq_id seq_id : cell.seq_id) {
562
- if (tails_verif[seq_id] != -1) {
563
- LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
564
- }
565
- tails_verif[seq_id] = i;
566
- }
567
- }
410
+ has_shift = false;
411
+
568
412
  for (uint32_t i = 0; i < size; ++i) {
569
- if (tails_verif[i] != cells[i].tail) {
570
- LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
571
- }
413
+ cells[i].delta = 0;
572
414
  }
573
415
  }
574
- #endif
416
+ }
575
417
 
576
- // find next empty cell
577
- uint32_t next_empty_cell = head;
418
+ if (do_defrag) {
419
+ LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
578
420
 
579
- for (uint32_t i = 0; i < size; ++i) {
580
- if (next_empty_cell >= size) { next_empty_cell -= size; }
581
- llama_kv_cell & cell = cells[next_empty_cell];
582
- if (cell.is_empty()) { break; }
583
- next_empty_cell += 1;
584
- }
421
+ if (defrag_prepare(lctx.graph_max_nodes())) {
422
+ lm_ggml_backend_sched_reset(sched);
585
423
 
586
- // find usable cell range
587
- for (uint32_t s = 0; s < n_seqs; ++s) {
588
- const llama_seq_id seq_id = ubatch.seq_id[s][0];
589
- llama_kv_cell & seq_meta = cells[seq_id];
590
- bool has_cell = false;
591
- if (seq_meta.tail >= 0) {
592
- llama_kv_cell & cell = cells[seq_meta.tail];
593
- LM_GGML_ASSERT(cell.has_seq_id(seq_id));
594
- // does this seq_id "own" the cell?
595
- if (cell.seq_id.size() == 1) { has_cell = true; }
596
- }
597
- if (!has_cell) {
598
- llama_kv_cell & empty_cell = cells[next_empty_cell];
599
- LM_GGML_ASSERT(empty_cell.is_empty());
600
- // copy old tail into the empty cell
601
- if (seq_meta.tail >= 0) {
602
- llama_kv_cell & orig_cell = cells[seq_meta.tail];
603
- empty_cell.pos = orig_cell.pos;
604
- empty_cell.src = orig_cell.src;
605
- orig_cell.seq_id.erase(seq_id);
606
- empty_cell.seq_id.insert(seq_id); // will be overwritten
607
- }
608
- seq_meta.tail = next_empty_cell;
609
- // find next empty cell
610
- if (s + 1 < n_seqs) {
611
- next_empty_cell += 1;
612
- for (uint32_t i = 0; i < size; ++i) {
613
- if (next_empty_cell >= size) { next_empty_cell -= size; }
614
- llama_kv_cell & cell = cells[next_empty_cell];
615
- if (cell.is_empty()) { break; }
616
- next_empty_cell += 1;
617
- }
618
- }
619
- }
620
- if (min > seq_meta.tail) { min = seq_meta.tail; }
621
- if (max < seq_meta.tail) { max = seq_meta.tail; }
622
- }
424
+ auto * gf = lctx.graph_init();
623
425
 
624
- // gather and re-order
625
- for (uint32_t s = 0; s < n_seqs; ++s) {
626
- int32_t dst_id = s + min;
627
- int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
628
- if (dst_id != src_id) {
629
- llama_kv_cell & dst_cell = cells[dst_id];
630
- llama_kv_cell & src_cell = cells[src_id];
426
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
631
427
 
632
- std::swap(dst_cell.pos, src_cell.pos);
633
- std::swap(dst_cell.src, src_cell.src);
634
- std::swap(dst_cell.seq_id, src_cell.seq_id);
428
+ lm_ggml_backend_sched_alloc_graph(sched, gf);
635
429
 
636
- // swap tails (assuming they NEVER overlap)
637
- for (const llama_seq_id seq_id : src_cell.seq_id) {
638
- cells[seq_id].tail = src_id;
639
- }
640
- for (const llama_seq_id seq_id : dst_cell.seq_id) {
641
- cells[seq_id].tail = dst_id;
642
- }
643
- }
644
- }
430
+ res->set_inputs(nullptr);
645
431
 
646
- // update the pos of the used seqs
647
- for (uint32_t s = 0; s < n_seqs; ++s) {
648
- const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
649
- int32_t cell_id = s + min;
650
- llama_kv_cell & cell = cells[cell_id];
432
+ lctx.graph_compute(gf, false);
651
433
 
652
- if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
653
- // What should happen when the pos backtracks or skips a value?
654
- // Clearing the state mid-batch would require special-casing which isn't done.
655
- LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
656
- __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
657
- }
658
- cell.pos = last_pos;
659
- cell.seq_id.clear();
660
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
661
- const llama_seq_id seq_id = ubatch.seq_id[s][j];
662
- cell.seq_id.insert(seq_id);
663
- cells[seq_id].tail = cell_id;
664
- }
434
+ need_reserve = true;
665
435
  }
666
436
 
667
- // allow getting the range of used cells, from head to head + n
668
- head = min;
669
- n = max - min + 1;
670
- used = std::count_if(cells.begin(), cells.end(),
671
- [](const llama_kv_cell& cell){ return !cell.is_empty(); });
437
+ do_defrag = false;
438
+ }
439
+
440
+ return need_reserve;
441
+ }
442
+
443
+ void llama_kv_cache_unified::defrag_sched(float thold) {
444
+ // - do not defrag small contexts (i.e. < 2048 tokens)
445
+ // - count the padding towards the number of used tokens
446
+ const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
447
+
448
+ // queue defragmentation for next llama_kv_cache_update
449
+ if (fragmentation > thold) {
450
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
451
+
452
+ do_defrag = true;
453
+ }
454
+ }
455
+
456
+ void llama_kv_cache_unified::set_full() {
457
+ n = size;
458
+
459
+ // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
460
+ // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
461
+ // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
462
+ // setting it to 0 is the simplest way to achieve that
463
+ // ref: https://github.com/ggml-org/llama.cpp/issues/13359
464
+ head = 0;
465
+ }
466
+
467
+ llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
468
+ return llama_sbatch(batch, hparams.n_embd, true, logits_all);
469
+ }
470
+
471
+ llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
472
+ LM_GGML_UNUSED(embd_pooled);
473
+ return sbatch.split_simple(n_ubatch);
474
+ }
475
+
476
+ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
477
+ const uint32_t n_tokens = ubatch.n_tokens;
672
478
 
673
- // sanity check
674
- return n >= n_seqs;
479
+ // if we have enough unused cells before the current head ->
480
+ // better to start searching from the beginning of the cache, hoping to fill it
481
+ if (head > used + 2*ubatch.n_tokens) {
482
+ head = 0;
675
483
  }
676
484
 
677
485
  // otherwise, one cell per token.
@@ -681,6 +489,29 @@ bool llama_kv_cache_unified::find_slot(
681
489
  return false;
682
490
  }
683
491
 
492
+ //#define FIND_SLOT_DEBUG 1
493
+ #if FIND_SLOT_DEBUG
494
+ LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
495
+
496
+ // for debugging
497
+ {
498
+ std::string ss;
499
+ if (n_swa > 0) {
500
+ for (uint32_t i = 0; i < size; ++i) {
501
+ if (cells[i].pos == -1) {
502
+ ss += '.';
503
+ } else {
504
+ ss += std::to_string(*cells[i].seq_id.begin());
505
+ }
506
+ if (i%256 == 255) {
507
+ ss += '\n';
508
+ }
509
+ }
510
+ }
511
+ LLAMA_LOG_WARN("\n%s\n", ss.c_str());
512
+ }
513
+ #endif
514
+
684
515
  uint32_t n_tested = 0;
685
516
 
686
517
  while (true) {
@@ -704,205 +535,1925 @@ bool llama_kv_cache_unified::find_slot(
704
535
  break;
705
536
  }
706
537
 
707
- if (n_tested >= size) {
708
- //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
709
- return false;
710
- }
538
+ if (n_tested >= size) {
539
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
540
+ return false;
541
+ }
542
+ }
543
+
544
+ for (uint32_t i = 0; i < n_tokens; ++i) {
545
+ // remember the original state
546
+ if (recovery.cells.find(head + i) == recovery.cells.end()) {
547
+ recovery.cells[head + i] = cells[head + i];
548
+ }
549
+
550
+ cells[head + i].pos = ubatch.pos[i];
551
+
552
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
553
+ cells[head + i].seq_id.insert(ubatch.seq_id[i][j]);
554
+ }
555
+ }
556
+
557
+ used += n_tokens;
558
+
559
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
560
+ // after enough generations, the benefit from this heuristic disappears
561
+ // if we start defragmenting the cache, the benefit from this will be more important
562
+ n = std::min(size, std::max(n_pad, LM_GGML_PAD(cell_max(), n_pad)));
563
+
564
+ #ifdef FIND_SLOT_DEBUG
565
+ LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
566
+ #endif
567
+
568
+ return true;
569
+ }
570
+
571
+ bool llama_kv_cache_unified::get_can_shift() const {
572
+ return true;
573
+ }
574
+
575
+ uint32_t llama_kv_cache_unified::get_n() const {
576
+ return n;
577
+ }
578
+
579
+ uint32_t llama_kv_cache_unified::get_size() const {
580
+ return size;
581
+ }
582
+
583
+ lm_ggml_tensor * llama_kv_cache_unified::get_k(lm_ggml_context * ctx, int32_t il) const {
584
+ const int32_t ikv = map_layer_ids.at(il);
585
+
586
+ auto * k = layers[ikv].k;
587
+
588
+ return lm_ggml_view_3d(ctx, k,
589
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n,
590
+ lm_ggml_row_size(k->type, hparams.n_embd_head_k),
591
+ lm_ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
592
+ 0);
593
+ }
594
+
595
+ lm_ggml_tensor * llama_kv_cache_unified::get_v(lm_ggml_context * ctx, int32_t il) const {
596
+ const int32_t ikv = map_layer_ids.at(il);
597
+
598
+ auto * v = layers[ikv].v;
599
+
600
+ if (!v_trans) {
601
+ // note: v->nb[1] <= v->nb[2]
602
+ return lm_ggml_view_3d(ctx, v,
603
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n,
604
+ lm_ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
605
+ lm_ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
606
+ 0);
607
+ }
608
+
609
+ // note: v->nb[1] > v->nb[2]
610
+ return lm_ggml_view_3d(ctx, v,
611
+ n, hparams.n_head_kv(il), hparams.n_embd_head_v,
612
+ lm_ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
613
+ lm_ggml_row_size(v->type, v->ne[1]), // v->nb[2]
614
+ 0);
615
+ }
616
+
617
+ lm_ggml_tensor * llama_kv_cache_unified::cpy_k(lm_ggml_context * ctx, lm_ggml_tensor * k_cur, int32_t il) const {
618
+ const int32_t ikv = map_layer_ids.at(il);
619
+
620
+ auto * k = layers[ikv].k;
621
+
622
+ const int64_t n_tokens = k_cur->ne[2];
623
+
624
+ lm_ggml_tensor * k_view = lm_ggml_view_1d(ctx, k,
625
+ n_tokens*hparams.n_embd_k_gqa(il),
626
+ lm_ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head);
627
+
628
+ return lm_ggml_cpy(ctx, k_cur, k_view);
629
+ }
630
+
631
+ lm_ggml_tensor * llama_kv_cache_unified::cpy_v(lm_ggml_context * ctx, lm_ggml_tensor * v_cur, int32_t il) const {
632
+ const int32_t ikv = map_layer_ids.at(il);
633
+
634
+ auto * v = layers[ikv].v;
635
+
636
+ const int64_t n_tokens = v_cur->ne[2];
637
+
638
+ v_cur = lm_ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
639
+
640
+ lm_ggml_tensor * v_view = nullptr;
641
+
642
+ if (!v_trans) {
643
+ v_view = lm_ggml_view_1d(ctx, v,
644
+ n_tokens*hparams.n_embd_v_gqa(il),
645
+ lm_ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head);
646
+ } else {
647
+ // note: the V cache is transposed when not using flash attention
648
+ v_view = lm_ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
649
+ (v->ne[1])*lm_ggml_element_size(v),
650
+ ( head)*lm_ggml_element_size(v));
651
+
652
+ v_cur = lm_ggml_transpose(ctx, v_cur);
653
+ }
654
+
655
+ return lm_ggml_cpy(ctx, v_cur, v_view);
656
+ }
657
+
658
+ void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
659
+ // no pruning is needed when the cache does not use SWA
660
+ LM_GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
661
+
662
+ int n_attended = 0;
663
+
664
+ for (uint32_t i = 0; i < size; ++i) {
665
+ const llama_pos p0 = cells[i].pos;
666
+
667
+ if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
668
+ n_attended++;
669
+ }
670
+
671
+ if (is_masked_swa(p0, pmax)) {
672
+ if (seq_id < 0) {
673
+ cells[i].seq_id.clear();
674
+ } else if (cells[i].has_seq_id(seq_id)) {
675
+ cells[i].seq_id.erase(seq_id);
676
+ } else {
677
+ continue;
678
+ }
679
+
680
+ if (cells[i].is_empty()) {
681
+ // keep count of the number of used cells
682
+ if (cells[i].pos >= 0) {
683
+ used--;
684
+ }
685
+
686
+ cells[i].pos = -1;
687
+ }
688
+ }
689
+ }
690
+
691
+ if (n_attended < std::min<int>(n_swa, pmin)) {
692
+ LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
693
+ }
694
+ }
695
+
696
+ void llama_kv_cache_unified::set_input_kq_mask(lm_ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
697
+ const int64_t n_tokens = ubatch->n_tokens;
698
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
699
+ const int64_t n_seqs = ubatch->n_seqs;
700
+
701
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
702
+ float * data = (float *) dst->data;
703
+
704
+ const int64_t n_kv = n;
705
+
706
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
707
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
708
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
709
+ // Causal mask:
710
+ // xxx-------
711
+ // xxxx------
712
+ // xxxxx-----
713
+ // Non-causal mask:
714
+ // xxxxx-----
715
+ // xxxxx-----
716
+ // xxxxx-----
717
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
718
+ for (int h = 0; h < 1; ++h) {
719
+ for (int s = 0; s < n_seqs; ++s) {
720
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
721
+
722
+ for (int j = 0; j < n_seq_tokens; ++j) {
723
+ const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
724
+
725
+ for (int i = 0; i < n_kv; ++i) {
726
+ const llama_pos p0 = cells[i].pos;
727
+
728
+ bool masked = false;
729
+
730
+ // mask the token if not the same sequence
731
+ masked = masked || (!cells[i].has_seq_id(seq_id));
732
+
733
+ // mask future tokens
734
+ masked = masked || (causal_attn && p0 > p1);
735
+
736
+ // apply SWA if any
737
+ masked = masked || (is_masked_swa(p0, p1));
738
+
739
+ float f = 0.0f;
740
+
741
+ if (masked) {
742
+ f = -INFINITY;
743
+ } else if (hparams.use_alibi) {
744
+ f = -std::abs(p0 - p1);
745
+ }
746
+
747
+ data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
748
+ }
749
+ }
750
+ }
751
+
752
+ // mask padded tokens
753
+ if (data) {
754
+ for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
755
+ for (int j = 0; j < n_kv; ++j) {
756
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
757
+ }
758
+ }
759
+ }
760
+ }
761
+ }
762
+
763
+ void llama_kv_cache_unified::set_input_k_shift(lm_ggml_tensor * dst) const {
764
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
765
+
766
+ int32_t * data = (int32_t *) dst->data;
767
+
768
+ for (uint32_t i = 0; i < size; ++i) {
769
+ data[i] = cells[i].delta;
770
+ }
771
+ }
772
+
773
+ void llama_kv_cache_unified::set_input_pos_bucket(lm_ggml_tensor * dst, const llama_ubatch * ubatch) const {
774
+ const int64_t n_tokens = ubatch->n_tokens;
775
+
776
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
777
+ LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
778
+
779
+ int32_t * data = (int32_t *) dst->data;
780
+
781
+ const int64_t n_kv = n;
782
+
783
+ for (int h = 0; h < 1; ++h) {
784
+ for (int j = 0; j < n_tokens; ++j) {
785
+ for (int i = 0; i < n_kv; ++i) {
786
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
787
+ }
788
+ }
789
+ }
790
+ }
791
+
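set_input_pos_bucket maps each (cell position, token position) pair to a relative-attention bucket. The sketch below is a simplified, hypothetical version of the usual T5-style bucketing (small distances get dedicated buckets, larger ones share logarithmically spaced buckets); the exact llama_relative_position_bucket implementation and its parameters may differ.

```cpp
// Simplified T5-style relative position bucketing (causal case), for
// illustration only -- not the package's helper.
#include <algorithm>
#include <cmath>
#include <cstdio>

static int rel_pos_bucket(int key_pos, int query_pos, int n_buckets, int max_distance = 128) {
    // causal case: only how far in the past the key is matters
    int rel = std::max(query_pos - key_pos, 0);

    const int max_exact = n_buckets / 2;
    if (rel < max_exact) {
        return rel; // small distances get their own bucket
    }

    // larger distances share logarithmically spaced buckets
    const int bucket = max_exact +
        (int) (std::log((float) rel / max_exact) / std::log((float) max_distance / max_exact) * (n_buckets - max_exact));

    return std::min(bucket, n_buckets - 1);
}

int main() {
    for (int d : {0, 1, 7, 16, 64, 200}) {
        printf("distance %3d -> bucket %d\n", d, rel_pos_bucket(0, d, 32));
    }
}
```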
792
+ size_t llama_kv_cache_unified::total_size() const {
793
+ size_t size = 0;
794
+
795
+ for (const auto & buf : bufs) {
796
+ size += lm_ggml_backend_buffer_get_size(buf.get());
797
+ }
798
+
799
+ return size;
800
+ }
801
+
802
+ size_t llama_kv_cache_unified::size_k_bytes() const {
803
+ size_t size_k_bytes = 0;
804
+
805
+ for (const auto & layer : layers) {
806
+ size_k_bytes += lm_ggml_nbytes(layer.k);
807
+ }
808
+
809
+ return size_k_bytes;
810
+ }
811
+
812
+ size_t llama_kv_cache_unified::size_v_bytes() const {
813
+ size_t size_v_bytes = 0;
814
+
815
+ for (const auto & layer : layers) {
816
+ size_v_bytes += lm_ggml_nbytes(layer.v);
817
+ }
818
+
819
+ return size_v_bytes;
820
+ }
821
+
822
+ lm_ggml_tensor * llama_kv_cache_unified::build_rope_shift(
823
+ const llama_cparams & cparams,
824
+ lm_ggml_context * ctx,
825
+ lm_ggml_tensor * cur,
826
+ lm_ggml_tensor * shift,
827
+ lm_ggml_tensor * factors,
828
+ float freq_base,
829
+ float freq_scale) const {
830
+ const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
831
+
832
+ const auto & yarn_ext_factor = cparams.yarn_ext_factor;
833
+ const auto & yarn_beta_fast = cparams.yarn_beta_fast;
834
+ const auto & yarn_beta_slow = cparams.yarn_beta_slow;
835
+
836
+ const auto & n_rot = hparams.n_rot;
837
+ const auto & rope_type = hparams.rope_type;
838
+
839
+ // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
840
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
841
+ const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
842
+
843
+ lm_ggml_tensor * tmp;
844
+
845
+ if (lm_ggml_is_quantized(cur->type)) {
846
+ // dequantize to f32 -> RoPE -> quantize back
847
+ tmp = lm_ggml_cast(ctx, cur, LM_GGML_TYPE_F32);
848
+
849
+ tmp = lm_ggml_rope_ext(ctx, tmp,
850
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
851
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
852
+
853
+ tmp = lm_ggml_cpy(ctx, tmp, cur);
854
+ } else {
855
+ // we rotate only the first n_rot dimensions
856
+ tmp = lm_ggml_rope_ext_inplace(ctx, cur,
857
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
858
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
859
+ }
860
+
861
+ return tmp;
862
+ }
863
+
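build_rope_shift rescales the YaRN attention factor for DeepSeek2 as 1 / (1 + 0.1 * ln(1 / freq_scale)). A quick numeric check with hypothetical freq_scale values (the real value comes from the model's RoPE configuration):

```cpp
// With no context scaling (freq_scale = 1) the factor stays 1; stronger
// scaling (smaller freq_scale) shrinks the attention factor.
#include <cmath>
#include <cstdio>

int main() {
    for (float freq_scale : {1.0f, 0.25f, 0.025f}) {
        const float attn_factor = 1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale));
        printf("freq_scale = %.3f -> yarn_attn_factor = %.4f\n", freq_scale, attn_factor);
    }
}
```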
864
+ class llm_graph_input_k_shift : public llm_graph_input_i {
865
+ public:
866
+ llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
867
+ virtual ~llm_graph_input_k_shift() = default;
868
+
869
+ void set_input(const llama_ubatch * ubatch) override;
870
+
871
+ lm_ggml_tensor * k_shift; // I32 [kv_size]
872
+
873
+ const llama_kv_cache_unified * kv_self;
874
+ };
875
+
876
+ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
877
+ LM_GGML_UNUSED(ubatch);
878
+
879
+ if (k_shift) {
880
+ kv_self->set_input_k_shift(k_shift);
881
+ }
882
+ }
883
+
884
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
885
+ const llama_cparams & cparams,
886
+ lm_ggml_context * ctx,
887
+ lm_ggml_cgraph * gf) const {
888
+ auto res = std::make_unique<llm_graph_result>();
889
+
890
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
891
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
892
+
893
+ //LM_GGML_ASSERT(kv_self->size == n_ctx);
894
+
895
+ auto inp = std::make_unique<llm_graph_input_k_shift>(this);
896
+
897
+ inp->k_shift = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, cparams.n_ctx);
898
+ lm_ggml_set_input(inp->k_shift);
899
+
900
+ for (const auto & layer : layers) {
901
+ const uint32_t il = layer.il;
902
+
903
+ const int64_t n_head_kv = hparams.n_head_kv(il);
904
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
905
+
906
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
907
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
908
+
909
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
910
+
911
+ lm_ggml_tensor * k =
912
+ lm_ggml_view_3d(ctx, layer.k,
913
+ n_embd_head_k, n_head_kv, size,
914
+ lm_ggml_row_size(layer.k->type, n_embd_head_k),
915
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
916
+ 0);
917
+
918
+ lm_ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
919
+
920
+ lm_ggml_build_forward_expand(gf, cur);
921
+ }
922
+
923
+ res->add_input(std::move(inp));
924
+
925
+ return res;
926
+ }
927
+
928
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
929
+ const llama_cparams & cparams,
930
+ lm_ggml_context * ctx,
931
+ lm_ggml_cgraph * gf) const {
932
+ auto res = std::make_unique<llm_graph_result>();
933
+
934
+ const auto & ids = defrag_info.ids;
935
+
936
+ #if 0
937
+ // CPU defrag
938
+ //
939
+ // TODO: optimizations are possible:
940
+ // - multiple threads
941
+ // - avoid copying to the host memory when already there
942
+ //
943
+ // likely not worth the effort, as we have lm_ggml_graph based defrag
944
+ //
945
+
946
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
947
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
948
+
949
+ const uint32_t kv_size = size;
950
+
951
+ std::vector<uint8_t> buf_k;
952
+ std::vector<uint8_t> buf_v;
953
+
954
+ for (uint32_t il = 0; il < n_layer; ++il) {
955
+ const size_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
956
+ const size_t k_size = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
957
+
958
+ const size_t v_size_el = lm_ggml_type_size(v_l[il]->type);
959
+ const size_t v_size = lm_ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
960
+
961
+ buf_k.resize(k_size);
962
+ buf_v.resize(v_size);
963
+
964
+ lm_ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
965
+ lm_ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
966
+
967
+ // batch move [i, i+nm) to [id, id+nm)
968
+ // note: cells can move only to a lower index
969
+ for (uint32_t i = 0; i < n_kv; ++i) {
970
+ const uint32_t id = ids[i];
971
+
972
+ if (i == id || id == n_kv) {
973
+ continue;
974
+ }
975
+
976
+ uint32_t nm = 1;
977
+
978
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
979
+ nm++;
980
+ }
981
+
982
+ // move keys
983
+ {
984
+ const int64_t os = i*k_size_row;
985
+ const int64_t od = id*k_size_row;
986
+
987
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
988
+ }
989
+
990
+ // move values (note: they are transposed)
991
+ {
992
+ const int64_t os = i;
993
+ const int64_t od = id;
994
+
995
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
996
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
997
+ }
998
+ }
999
+
1000
+ i += nm - 1;
1001
+ }
1002
+
1003
+ lm_ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
1004
+ lm_ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
1005
+ }
1006
+ #else
1007
+ for (uint32_t i = 0; i < ids.size(); ++i) {
1008
+ const uint32_t id = ids[i];
1009
+
1010
+ if (i == id || id == ids.size()) {
1011
+ continue;
1012
+ }
1013
+
1014
+ uint32_t nm = 1;
1015
+
1016
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
1017
+ nm++;
1018
+ }
1019
+
1020
+ for (const auto & layer : layers) {
1021
+ const uint32_t il = layer.il;
1022
+
1023
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1024
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1025
+
1026
+ lm_ggml_tensor * view_k_src = lm_ggml_view_2d(ctx, layer.k,
1027
+ n_embd_k_gqa, nm,
1028
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
1029
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa*i));
1030
+
1031
+ lm_ggml_tensor * view_k_dst = lm_ggml_view_2d(ctx, layer.k,
1032
+ n_embd_k_gqa, nm,
1033
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
1034
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa*id));
1035
+
1036
+ lm_ggml_tensor * view_v_src;
1037
+ lm_ggml_tensor * view_v_dst;
1038
+
1039
+ if (cparams.flash_attn) {
1040
+ // NOTE: the V cache is not transposed when using flash attention
1041
+ view_v_src = lm_ggml_view_2d(ctx, layer.v,
1042
+ n_embd_v_gqa, nm,
1043
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa),
1044
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa*i));
1045
+
1046
+ view_v_dst = lm_ggml_view_2d(ctx, layer.v,
1047
+ n_embd_v_gqa, nm,
1048
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa),
1049
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa*id));
1050
+ } else {
1051
+ view_v_src = lm_ggml_view_2d(ctx, layer.v,
1052
+ nm, n_embd_v_gqa,
1053
+ lm_ggml_row_size(layer.v->type, size),
1054
+ lm_ggml_row_size(layer.v->type, i));
1055
+
1056
+ view_v_dst = lm_ggml_view_2d(ctx, layer.v,
1057
+ nm, n_embd_v_gqa,
1058
+ lm_ggml_row_size(layer.v->type, size),
1059
+ lm_ggml_row_size(layer.v->type, id));
1060
+ }
1061
+
1062
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx, view_k_src, view_k_dst));
1063
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx, view_v_src, view_v_dst));
1064
+ }
1065
+
1066
+ i += nm - 1;
1067
+ }
1068
+
1069
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
1070
+ #endif
1071
+
1072
+ return res;
1073
+ }
1074
+
1075
+ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
1076
+ const uint32_t n_layer = layers.size();
1077
+
1078
+ const uint32_t n_kv = cell_max();
1079
+ const uint32_t n_used = used;
1080
+
1081
+ assert(n_used <= n_kv);
1082
+
1083
+ //const int64_t t_start = lm_ggml_time_us();
1084
+
1085
+ // number of cells moved
1086
+ uint32_t n_moves = 0;
1087
+
1088
+ // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
1089
+ // - source view, destination view, copy operation
1090
+ // - x2 for keys and values
1091
+ //const uint32_t max_moves = max_nodes()/(6*n_layer);
1092
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
1093
+ const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
1094
+
1095
+ // determine which KV cells to move where
1096
+ //
1097
+ // cell i moves to ids[i]
1098
+ //
1099
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
1100
+ //
1101
+ auto & ids = defrag_info.ids;
1102
+
1103
+ ids.clear();
1104
+ ids.resize(n_kv, n_kv);
1105
+
1106
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
1107
+ const auto & cell0 = cells[i0];
1108
+
1109
+ if (!cell0.is_empty()) {
1110
+ ids[i0] = i0;
1111
+
1112
+ continue;
1113
+ }
1114
+
1115
+ // found a hole - fill it with data from the end of the cache
1116
+
1117
+ uint32_t nh = 1;
1118
+
1119
+ // determine the size of the hole
1120
+ while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
1121
+ nh++;
1122
+ }
1123
+
1124
+ uint32_t nf = 0;
1125
+ uint32_t is = n_kv - 1;
1126
+
1127
+ // starting from the end, find nh non-empty cells
1128
+ for (; is > i0; --is) {
1129
+ const auto & cell1 = cells[is];
1130
+
1131
+ if (cell1.is_empty() || ids[is] != n_kv) {
1132
+ continue;
1133
+ }
1134
+
1135
+ // non-empty cell which is not yet moved
1136
+ nf++;
1137
+
1138
+ if (nf == nh) {
1139
+ break;
1140
+ }
1141
+ }
1142
+
1143
+ // this can only happen if `n_used` is not accurate, which would be a bug
1144
+ LM_GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
1145
+
1146
+ nf = 0;
1147
+
1148
+ uint32_t i1 = is;
1149
+
1150
+ // are we moving a continuous block of memory?
1151
+ bool cont = false;
1152
+
1153
+ // should we stop searching for the next move?
1154
+ bool stop = false;
1155
+
1156
+ // go back and move the nf cells to the hole
1157
+ for (; i1 < n_kv; ++i1) {
1158
+ auto & cell1 = cells[i1];
1159
+
1160
+ if (cell1.is_empty() || ids[i1] != n_kv) {
1161
+ if (n_moves == max_moves) {
1162
+ stop = true;
1163
+ break;
1164
+ }
1165
+
1166
+ cont = false;
1167
+ continue;
1168
+ }
1169
+
1170
+ // this cell goes to (i0 + nf)
1171
+ ids[i1] = i0 + nf;
1172
+
1173
+ // move the cell meta data
1174
+ cells[i0 + nf] = cell1;
1175
+
1176
+ // clear the old cell and move the head there
1177
+ cell1 = kv_cell();
1178
+ head = n_used;
1179
+
1180
+ if (!cont) {
1181
+ n_moves++;
1182
+ cont = true;
1183
+ }
1184
+
1185
+ nf++;
1186
+
1187
+ if (nf == nh) {
1188
+ break;
1189
+ }
1190
+ }
1191
+
1192
+ if (stop || n_moves == max_moves) {
1193
+ break;
1194
+ }
1195
+
1196
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
1197
+
1198
+ i0 += nh - 1;
1199
+ }
1200
+
1201
+ if (n_moves == 0) {
1202
+ return false;
1203
+ }
1204
+
1205
+ LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
1206
+
1207
+ LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
1208
+
1209
+ return true;
1210
+ }
1211
+
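defrag_prepare only produces a move plan: cell i moves to ids[i], and ids[i] == i or ids[i] == n_kv means the cell stays put. A simplified standalone sketch of that contract on a toy occupancy vector; it fills each hole in the used prefix from the tail, but leaves out the batching of contiguous moves and the max_moves cap used above.

```cpp
// Toy defrag planner: produce ids[] so that every used cell ends up in the
// compacted prefix, matching the "cell i moves to ids[i]" convention.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // occupancy of a toy 8-cell cache: 1 = used, 0 = empty
    const std::vector<int> used = {1, 0, 1, 0, 1, 1, 0, 1};
    const uint32_t n_kv = used.size();

    std::vector<uint32_t> ids(n_kv, n_kv); // n_kv == "not moved"

    for (uint32_t i = 0; i < n_kv; ++i) {
        if (used[i]) {
            ids[i] = i; // already in place
        }
    }

    uint32_t dst = 0;    // next hole to fill, scanning from the front
    uint32_t src = n_kv; // candidate used cell, scanning from the back

    while (true) {
        while (dst < n_kv && used[dst]) dst++;                      // find a hole
        while (src > 0 && (!used[src - 1] || src - 1 < dst)) src--; // find a mover
        if (dst >= n_kv || src == 0 || src - 1 <= dst) break;
        ids[src - 1] = dst; // move the tail cell into the hole
        dst++;
        src--;
    }

    for (uint32_t i = 0; i < n_kv; ++i) {
        printf("cell %u -> %u%s\n", i, ids[i], ids[i] == n_kv ? " (not moved)" : "");
    }
}
```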
1212
+ uint32_t llama_kv_cache_unified::cell_max() const {
1213
+ for (uint32_t i = size; i > 0; --i) {
1214
+ const kv_cell & cell = cells[i - 1];
1215
+
1216
+ if (cell.pos >= 0 && !cell.is_empty()) {
1217
+ return i;
1218
+ }
1219
+ }
1220
+
1221
+ return 0;
1222
+ }
1223
+
1224
+ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
1225
+ if (p0 < 0) {
1226
+ return true;
1227
+ }
1228
+
1229
+ switch (swa_type) {
1230
+ case LLAMA_SWA_TYPE_NONE:
1231
+ {
1232
+ } break;
1233
+ case LLAMA_SWA_TYPE_STANDARD:
1234
+ {
1235
+ if (p1 - p0 >= (int32_t) n_swa) {
1236
+ return true;
1237
+ }
1238
+ } break;
1239
+ case LLAMA_SWA_TYPE_CHUNKED:
1240
+ {
1241
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
1242
+
1243
+ if (p0 < pos_chunk_start) {
1244
+ return true;
1245
+ }
1246
+ } break;
1247
+ }
1248
+
1249
+ return false;
1250
+ }
1251
+
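is_masked_swa implements two window rules: standard SWA keeps only the last n_swa positions, while chunked SWA keeps only the positions inside the query's current chunk. A hedged sketch with a hypothetical n_swa = 4 and query position 9:

```cpp
// Which past positions p0 remain visible to a query at p1 under each rule.
#include <cstdio>

static bool masked_standard(int p0, int p1, int n_swa) { return p1 - p0 >= n_swa; }
static bool masked_chunked (int p0, int p1, int n_swa) { return p0 < (p1 / n_swa) * n_swa; }

int main() {
    const int n_swa = 4;
    const int p1    = 9; // query position

    printf("p0: standard chunked\n");
    for (int p0 = 0; p0 <= p1; ++p0) {
        printf("%2d:    %c        %c\n", p0,
               masked_standard(p0, p1, n_swa) ? '-' : 'x',
               masked_chunked (p0, p1, n_swa) ? '-' : 'x');
    }
}
```

Standard SWA leaves positions 6..9 visible here; chunked SWA leaves only 8..9 (the chunk starting at 8).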
1252
+ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
1253
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
1254
+ uint32_t cell_count = 0;
1255
+
1256
+ // Count the number of cells with the specified seq_id
1257
+ // Find all the ranges of cells with this seq id (or all, when -1)
1258
+ uint32_t cell_range_begin = size;
1259
+ for (uint32_t i = 0; i < size; ++i) {
1260
+ const auto & cell = cells[i];
1261
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
1262
+ ++cell_count;
1263
+ if (cell_range_begin == size) {
1264
+ cell_range_begin = i;
1265
+ }
1266
+ } else {
1267
+ if (cell_range_begin != size) {
1268
+ cell_ranges.emplace_back(cell_range_begin, i);
1269
+ cell_range_begin = size;
1270
+ }
1271
+ }
1272
+ }
1273
+ if (cell_range_begin != size) {
1274
+ cell_ranges.emplace_back(cell_range_begin, size);
1275
+ }
1276
+
1277
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
1278
+ uint32_t cell_count_check = 0;
1279
+ for (const auto & range : cell_ranges) {
1280
+ cell_count_check += range.second - range.first;
1281
+ }
1282
+ LM_GGML_ASSERT(cell_count == cell_count_check);
1283
+
1284
+ io.write(&cell_count, sizeof(cell_count));
1285
+
1286
+ state_write_meta(io, cell_ranges, seq_id);
1287
+ state_write_data(io, cell_ranges);
1288
+ }
1289
+
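state_write first collapses the cells owned by a sequence into contiguous [first, last) ranges before serializing them. A standalone sketch of that range-collection step on toy flags (not the package's cell type):

```cpp
// Scan a per-cell flag and emit [first, last) ranges of consecutive matches.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const std::vector<bool> match = {false, true, true, false, true, true, true, false};
    const size_t size = match.size();

    std::vector<std::pair<size_t, size_t>> ranges; // from inclusive, to exclusive
    size_t begin = size;                           // "size" means: no open range

    for (size_t i = 0; i < size; ++i) {
        if (match[i]) {
            if (begin == size) begin = i;          // open a new range
        } else if (begin != size) {
            ranges.emplace_back(begin, i);         // close the current range
            begin = size;
        }
    }
    if (begin != size) ranges.emplace_back(begin, size);

    for (const auto & r : ranges) {
        printf("[%zu, %zu)\n", r.first, r.second); // prints [1, 3) and [4, 7)
    }
}
```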
1290
+ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1291
+ uint32_t cell_count;
1292
+ io.read_to(&cell_count, sizeof(cell_count));
1293
+
1294
+ bool res = true;
1295
+ res = res && state_read_meta(io, cell_count, seq_id);
1296
+ res = res && state_read_data(io, cell_count);
1297
+
1298
+ if (!res) {
1299
+ if (seq_id == -1) {
1300
+ clear();
1301
+ } else {
1302
+ seq_rm(seq_id, -1, -1);
1303
+ }
1304
+ throw std::runtime_error("failed to restore kv cache");
1305
+ }
1306
+ }
1307
+
1308
+ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
1309
+ for (const auto & range : cell_ranges) {
1310
+ for (uint32_t i = range.first; i < range.second; ++i) {
1311
+ const auto & cell = cells[i];
1312
+ const llama_pos pos = cell.pos;
1313
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
1314
+
1315
+ io.write(&pos, sizeof(pos));
1316
+ io.write(&n_seq_id, sizeof(n_seq_id));
1317
+
1318
+ if (n_seq_id) {
1319
+ for (auto seq_id : cell.seq_id) {
1320
+ io.write(&seq_id, sizeof(seq_id));
1321
+ }
1322
+ }
1323
+ }
1324
+ }
1325
+ }
1326
+
1327
+ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
1328
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
1329
+ const uint32_t n_layer = layers.size();
1330
+
1331
+ io.write(&v_trans, sizeof(v_trans));
1332
+ io.write(&n_layer, sizeof(n_layer));
1333
+
1334
+ std::vector<uint8_t> tmp_buf;
1335
+
1336
+ // Iterate and write all the keys first, each row is a cell
1337
+ // Get whole range at a time
1338
+ for (const auto & layer : layers) {
1339
+ const uint32_t il = layer.il;
1340
+
1341
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1342
+
1343
+ // Write key type
1344
+ const int32_t k_type_i = (int32_t)layer.k->type;
1345
+ io.write(&k_type_i, sizeof(k_type_i));
1346
+
1347
+ // Write row size of key
1348
+ const uint64_t k_size_row = lm_ggml_row_size(layer.k->type, n_embd_k_gqa);
1349
+ io.write(&k_size_row, sizeof(k_size_row));
1350
+
1351
+ // Read each range of cells of k_size length each into tmp_buf and write out
1352
+ for (const auto & range : cell_ranges) {
1353
+ const size_t range_size = range.second - range.first;
1354
+ const size_t buf_size = range_size * k_size_row;
1355
+ io.write_tensor(layer.k, range.first * k_size_row, buf_size);
1356
+ }
1357
+ }
1358
+
1359
+ if (!v_trans) {
1360
+ for (const auto & layer : layers) {
1361
+ const uint32_t il = layer.il;
1362
+
1363
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1364
+
1365
+ // Write value type
1366
+ const int32_t v_type_i = (int32_t)layer.v->type;
1367
+ io.write(&v_type_i, sizeof(v_type_i));
1368
+
1369
+ // Write row size of value
1370
+ const uint64_t v_size_row = lm_ggml_row_size(layer.v->type, n_embd_v_gqa);
1371
+ io.write(&v_size_row, sizeof(v_size_row));
1372
+
1373
+ // Read each range of cells of v_size length each into tmp_buf and write out
1374
+ for (const auto & range : cell_ranges) {
1375
+ const size_t range_size = range.second - range.first;
1376
+ const size_t buf_size = range_size * v_size_row;
1377
+ io.write_tensor(layer.v, range.first * v_size_row, buf_size);
1378
+ }
1379
+ }
1380
+ } else {
1381
+ // When v is transposed, we also need the element size and get the element ranges from each row
1382
+ const uint32_t kv_size = size;
1383
+
1384
+ for (const auto & layer : layers) {
1385
+ const uint32_t il = layer.il;
1386
+
1387
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1388
+
1389
+ // Write value type
1390
+ const int32_t v_type_i = (int32_t)layer.v->type;
1391
+ io.write(&v_type_i, sizeof(v_type_i));
1392
+
1393
+ // Write element size
1394
+ const uint32_t v_size_el = lm_ggml_type_size(layer.v->type);
1395
+ io.write(&v_size_el, sizeof(v_size_el));
1396
+
1397
+ // Write GQA embedding size
1398
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
1399
+
1400
+ // For each row, we get the element values of each cell
1401
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1402
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
1403
+ for (const auto & range : cell_ranges) {
1404
+ const size_t range_size = range.second - range.first;
1405
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1406
+ const size_t buf_size = range_size * v_size_el;
1407
+ io.write_tensor(layer.v, src_offset, buf_size);
1408
+ }
1409
+ }
1410
+ }
1411
+ }
1412
+ }
1413
+
1414
+ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
1415
+ if (dest_seq_id != -1) {
1416
+ // single sequence
1417
+
1418
+ seq_rm(dest_seq_id, -1, -1);
1419
+
1420
+ llama_sbatch sbatch;
1421
+ llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
1422
+
1423
+ batch.n_tokens = cell_count;
1424
+
1425
+ for (uint32_t i = 0; i < cell_count; ++i) {
1426
+ llama_pos pos;
1427
+ uint32_t n_seq_id;
1428
+
1429
+ io.read_to(&pos, sizeof(pos));
1430
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1431
+
1432
+ if (n_seq_id != 0) {
1433
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1434
+ return false;
1435
+ }
1436
+
1437
+ batch.pos[i] = pos;
1438
+ batch.n_seq_id[i] = 1;
1439
+ batch.seq_id[i] = &dest_seq_id;
1440
+ }
1441
+
1442
+ if (!find_slot(batch)) {
1443
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1444
+ return false;
1445
+ }
1446
+
1447
+ commit();
1448
+
1449
+ // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
1450
+ // Assume that this is one contiguous block of cells
1451
+ LM_GGML_ASSERT(head + cell_count <= size);
1452
+ LM_GGML_ASSERT(cells[head].pos == batch.pos[0]);
1453
+ LM_GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
1454
+ LM_GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
1455
+ LM_GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
1456
+ } else {
1457
+ // whole KV cache restore
1458
+
1459
+ if (cell_count > size) {
1460
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1461
+ return false;
1462
+ }
1463
+
1464
+ clear();
1465
+
1466
+ for (uint32_t i = 0; i < cell_count; ++i) {
1467
+ kv_cell & cell = cells[i];
1468
+
1469
+ llama_pos pos;
1470
+ uint32_t n_seq_id;
1471
+
1472
+ io.read_to(&pos, sizeof(pos));
1473
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1474
+
1475
+ cell.pos = pos;
1476
+
1477
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1478
+ llama_seq_id seq_id;
1479
+ io.read_to(&seq_id, sizeof(seq_id));
1480
+
1481
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
1482
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
1483
+ return false;
1484
+ }
1485
+
1486
+ cell.seq_id.insert(seq_id);
1487
+ }
1488
+ }
1489
+
1490
+ head = 0;
1491
+ used = cell_count;
1492
+ }
1493
+
1494
+ return true;
1495
+ }
1496
+
1497
+ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
1498
+ uint32_t v_trans;
1499
+ uint32_t n_layer;
1500
+
1501
+ io.read_to(&v_trans, sizeof(v_trans));
1502
+ io.read_to(&n_layer, sizeof(n_layer));
1503
+
1504
+ if (n_layer != layers.size()) {
1505
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
1506
+ return false;
1507
+ }
1508
+ if (cell_count > size) {
1509
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
1510
+ return false;
1511
+ }
1512
+ if (this->v_trans != (bool) v_trans) {
1513
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
1514
+ return false;
1515
+ }
1516
+
1517
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1518
+ for (const auto & layer : layers) {
1519
+ const uint32_t il = layer.il;
1520
+
1521
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1522
+
1523
+ // Read type of key
1524
+ int32_t k_type_i_ref;
1525
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1526
+ const int32_t k_type_i = (int32_t) layer.k->type;
1527
+ if (k_type_i != k_type_i_ref) {
1528
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1529
+ return false;
1530
+ }
1531
+
1532
+ // Read row size of key
1533
+ uint64_t k_size_row_ref;
1534
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1535
+ const size_t k_size_row = lm_ggml_row_size(layer.k->type, n_embd_k_gqa);
1536
+ if (k_size_row != k_size_row_ref) {
1537
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1538
+ return false;
1539
+ }
1540
+
1541
+ if (cell_count) {
1542
+ // Read and set the keys for the whole cell range
1543
+ lm_ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1544
+ }
1545
+ }
1546
+
1547
+ if (!this->v_trans) {
1548
+ for (const auto & layer : layers) {
1549
+ const uint32_t il = layer.il;
1550
+
1551
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1552
+
1553
+ // Read type of value
1554
+ int32_t v_type_i_ref;
1555
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1556
+ const int32_t v_type_i = (int32_t)layer.v->type;
1557
+ if (v_type_i != v_type_i_ref) {
1558
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1559
+ return false;
1560
+ }
1561
+
1562
+ // Read row size of value
1563
+ uint64_t v_size_row_ref;
1564
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1565
+ const size_t v_size_row = lm_ggml_row_size(layer.v->type, n_embd_v_gqa);
1566
+ if (v_size_row != v_size_row_ref) {
1567
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1568
+ return false;
1569
+ }
1570
+
1571
+ if (cell_count) {
1572
+ // Read and set the values for the whole cell range
1573
+ lm_ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1574
+ }
1575
+ }
1576
+ } else {
1577
+ // For each layer, read the values for each cell (transposed)
1578
+ for (const auto & layer : layers) {
1579
+ const uint32_t il = layer.il;
1580
+
1581
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1582
+
1583
+ // Read type of value
1584
+ int32_t v_type_i_ref;
1585
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1586
+ const int32_t v_type_i = (int32_t)layer.v->type;
1587
+ if (v_type_i != v_type_i_ref) {
1588
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1589
+ return false;
1590
+ }
1591
+
1592
+ // Read element size of value
1593
+ uint32_t v_size_el_ref;
1594
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1595
+ const size_t v_size_el = lm_ggml_type_size(layer.v->type);
1596
+ if (v_size_el != v_size_el_ref) {
1597
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1598
+ return false;
1599
+ }
1600
+
1601
+ // Read GQA embedding size
1602
+ uint32_t n_embd_v_gqa_ref;
1603
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1604
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1605
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1606
+ return false;
1607
+ }
1608
+
1609
+ if (cell_count) {
1610
+ // For each row in the transposed matrix, read the values for the whole cell range
1611
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1612
+ const size_t dst_offset = (head + j * size) * v_size_el;
1613
+ lm_ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1614
+ }
1615
+ }
1616
+ }
1617
+ }
1618
+
1619
+ return true;
1620
+ }
1621
+
1622
+ //
1623
+ // llama_kv_cache_unified_iswa
1624
+ //
1625
+
1626
+ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
1627
+ const llama_model & model,
1628
+ lm_ggml_type type_k,
1629
+ lm_ggml_type type_v,
1630
+ bool v_trans,
1631
+ bool offload,
1632
+ bool swa_full,
1633
+ uint32_t kv_size,
1634
+ uint32_t n_seq_max,
1635
+ uint32_t n_batch,
1636
+ uint32_t n_pad) : hparams(model.hparams) {
1637
+ llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
1638
+ llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
1639
+
1640
+ const uint32_t size_base = kv_size;
1641
+
1642
+ uint32_t size_swa = std::min(size_base, LM_GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
1643
+
1644
+ // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
1645
+ if (swa_full) {
1646
+ LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
1647
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
1648
+
1649
+ size_swa = size_base;
1650
+ do_prune = false;
1651
+ }
1652
+
1653
+ LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
1654
+
1655
+ kv_base = std::make_unique<llama_kv_cache_unified>(
1656
+ model, std::move(filter_base), type_k, type_v,
1657
+ v_trans, offload, size_base, n_seq_max, n_pad,
1658
+ 0, LLAMA_SWA_TYPE_NONE);
1659
+
1660
+ LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
1661
+
1662
+ kv_swa = std::make_unique<llama_kv_cache_unified>(
1663
+ model, std::move(filter_swa), type_k, type_v,
1664
+ v_trans, offload, size_swa, n_seq_max, n_pad,
1665
+ hparams.n_swa, hparams.swa_type);
1666
+ }
1667
+
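The iSWA constructor sizes the SWA cache as min(kv_size, PAD(n_swa*n_seq_max + n_batch, n_pad)), i.e. just enough cells for the window of every sequence plus one batch, rounded up to the padding. A worked example with hypothetical numbers:

```cpp
// Illustrative sizing only; the real values come from hparams and cparams.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t pad_to(uint32_t x, uint32_t p) { return ((x + p - 1) / p) * p; } // same idea as LM_GGML_PAD

int main() {
    const uint32_t size_base = 8192; // kv_size
    const uint32_t n_swa     = 1024;
    const uint32_t n_seq_max = 2;
    const uint32_t n_batch   = 500;
    const uint32_t n_pad     = 256;

    const uint32_t size_swa = std::min(size_base, pad_to(n_swa*n_seq_max + n_batch, n_pad));

    printf("size_swa = %u cells (vs %u base)\n", size_swa, size_base); // 2560 vs 8192
}
```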
1668
+ void llama_kv_cache_unified_iswa::clear() {
1669
+ kv_base->clear();
1670
+ kv_swa ->clear();
1671
+ }
1672
+
1673
+ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
1674
+ bool res = true;
1675
+
1676
+ res = res & kv_base->seq_rm(seq_id, p0, p1);
1677
+ res = res & kv_swa ->seq_rm(seq_id, p0, p1);
1678
+
1679
+ return res;
1680
+ }
1681
+
1682
+ void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
1683
+ kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
1684
+ kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
1685
+ }
1686
+
1687
+ void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
1688
+ kv_base->seq_keep(seq_id);
1689
+ kv_swa ->seq_keep(seq_id);
1690
+ }
1691
+
1692
+ void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
1693
+ kv_base->seq_add(seq_id, p0, p1, delta);
1694
+ kv_swa ->seq_add(seq_id, p0, p1, delta);
1695
+ }
1696
+
1697
+ void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
1698
+ kv_base->seq_div(seq_id, p0, p1, d);
1699
+ kv_swa ->seq_div(seq_id, p0, p1, d);
1700
+ }
1701
+
1702
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
1703
+ // the base cache is a superset of the SWA cache, so we can just check the SWA cache
1704
+ return kv_swa->seq_pos_min(seq_id);
1705
+ }
1706
+
1707
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
1708
+ return kv_swa->seq_pos_max(seq_id);
1709
+ }
1710
+
1711
+ void llama_kv_cache_unified_iswa::restore() {
1712
+ kv_base->restore();
1713
+ kv_swa ->restore();
1714
+ }
1715
+
1716
+ void llama_kv_cache_unified_iswa::commit() {
1717
+ kv_base->commit();
1718
+ kv_swa ->commit();
1719
+
1720
+ // slide the attention window, forgetting/pruning old tokens that are outside the window
1721
+ if (do_prune) {
1722
+ for (const auto & [seq_id, entry] : pending.pos) {
1723
+ kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
1724
+ }
1725
+
1726
+ }
1727
+
1728
+ pending.clear();
1729
+ }
1730
+
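commit() prunes the SWA cache using per-sequence [pmin, pmax] ranges that are accumulated while the batch is prepared (see sbatch_init further down). A sketch of that bookkeeping with hypothetical standalone types; in the real cache the loop at the end would call prune_swa(seq_id, pmin, pmax):

```cpp
// Track the min/max position seen per sequence across a toy batch.
#include <algorithm>
#include <cstdio>
#include <map>

struct pos_range { int pmin; int pmax; };

int main() {
    const int seq[] = {0, 0, 1, 0, 1}; // seq_id per batch token
    const int pos[] = {7, 8, 3, 9, 4}; // position per batch token

    std::map<int, pos_range> pending;
    for (int i = 0; i < 5; ++i) {
        auto it = pending.find(seq[i]);
        if (it == pending.end()) {
            pending[seq[i]] = {pos[i], pos[i]};
        } else {
            it->second.pmin = std::min(it->second.pmin, pos[i]);
            it->second.pmax = std::max(it->second.pmax, pos[i]);
        }
    }

    for (const auto & [seq_id, r] : pending) {
        // this is the point where the SWA cache would be pruned for seq_id
        printf("seq %d: pmin = %d, pmax = %d\n", seq_id, r.pmin, r.pmax);
    }
}
```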
1731
+ bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
1732
+ bool res = true;
1733
+
1734
+ res = res & kv_base->update(lctx);
1735
+ res = res & kv_swa ->update(lctx);
1736
+
1737
+ return res;
1738
+ }
1739
+
1740
+ void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
1741
+ kv_base->defrag_sched(thold);
1742
+ kv_swa ->defrag_sched(thold);
1743
+ }
1744
+
1745
+ void llama_kv_cache_unified_iswa::set_full() {
1746
+ kv_base->set_full();
1747
+ kv_swa ->set_full();
1748
+ }
1749
+
1750
+ llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
1751
+ pending.clear();
1752
+
1753
+ if (do_prune) {
1754
+ for (int i = 0; i < batch.n_tokens; ++i) {
1755
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
1756
+ const llama_seq_id seq_id = batch.seq_id[i][s];
1757
+ const llama_pos pos = batch.pos[i];
1758
+
1759
+ if (pending.pos.find(seq_id) == pending.pos.end()) {
1760
+ pending.pos[seq_id].pmin = pos;
1761
+ pending.pos[seq_id].pmax = pos;
1762
+ } else {
1763
+ pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
1764
+ pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
1765
+ }
1766
+ }
1767
+ }
1768
+ }
1769
+
1770
+ return llama_sbatch(batch, hparams.n_embd, true, logits_all);
1771
+ }
1772
+
1773
+ llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
1774
+ LM_GGML_UNUSED(embd_pooled);
1775
+ return sbatch.split_simple(n_ubatch);
1776
+ }
1777
+
1778
+ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
1779
+ bool res = true;
1780
+
1781
+ res = res & kv_base->find_slot(batch);
1782
+ res = res & kv_swa ->find_slot(batch);
1783
+
1784
+ return res;
1785
+ }
1786
+
1787
+ bool llama_kv_cache_unified_iswa::get_can_shift() const {
1788
+ return kv_base->get_size() == kv_swa->get_size();
1789
+ }
1790
+
1791
+ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
1792
+ kv_base->state_write(io, seq_id);
1793
+ kv_swa ->state_write(io, seq_id);
1794
+ }
1795
+
1796
+ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1797
+ kv_base->state_read(io, seq_id);
1798
+ kv_swa ->state_read(io, seq_id);
1799
+ }
1800
+
1801
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
1802
+ return kv_base.get();
1803
+ }
1804
+
1805
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
1806
+ return kv_swa.get();
1807
+ }
1808
+
1809
+ //
1810
+ // llama_kv_cache_recurrent
1811
+ //
1812
+
1813
+ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
1814
+ const llama_model & model,
1815
+ lm_ggml_type type_k,
1816
+ lm_ggml_type type_v,
1817
+ bool offload,
1818
+ uint32_t kv_size,
1819
+ uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
1820
+ const int32_t n_layer = hparams.n_layer;
1821
+
1822
+ LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
1823
+ __func__, kv_size, n_seq_max, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer);
1824
+
1825
+ head = 0;
1826
+ size = kv_size;
1827
+ used = 0;
1828
+
1829
+ cells.clear();
1830
+ cells.resize(kv_size);
1831
+
1832
+ // create a context for each buffer type
1833
+ std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
1834
+ auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
1835
+ auto it = ctx_map.find(buft);
1836
+ if (it == ctx_map.end()) {
1837
+ lm_ggml_init_params params = {
1838
+ /*.mem_size =*/ size_t(2u*n_layer*lm_ggml_tensor_overhead()),
1839
+ /*.mem_buffer =*/ NULL,
1840
+ /*.no_alloc =*/ true,
1841
+ };
1842
+
1843
+ lm_ggml_context * ctx = lm_ggml_init(params);
1844
+ if (!ctx) {
1845
+ return nullptr;
1846
+ }
1847
+
1848
+ ctx_map[buft] = ctx;
1849
+ ctxs.emplace_back(ctx);
1850
+
1851
+ return ctx;
1852
+ }
1853
+
1854
+ return it->second;
1855
+ };
1856
+
1857
+ k_l.reserve(n_layer);
1858
+ v_l.reserve(n_layer);
1859
+
1860
+ for (int i = 0; i < n_layer; i++) {
1861
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
1862
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
1863
+
1864
+ const char * dev_name = "CPU";
1865
+
1866
+ lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
1867
+
1868
+ if (offload) {
1869
+ auto * dev = model.dev_layer(i);
1870
+ buft = lm_ggml_backend_dev_buffer_type(dev);
1871
+
1872
+ dev_name = lm_ggml_backend_dev_name(dev);
1873
+ }
1874
+
1875
+ LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
1876
+
1877
+ lm_ggml_context * ctx = ctx_for_buft(buft);
1878
+ if (!ctx) {
1879
+ throw std::runtime_error("failed to create ggml context for kv cache");
1880
+ }
1881
+
1882
+ lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
1883
+ lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
1884
+ lm_ggml_format_name(k, "cache_k_l%d", i);
1885
+ lm_ggml_format_name(v, "cache_v_l%d", i);
1886
+ k_l.push_back(k);
1887
+ v_l.push_back(v);
1888
+ }
1889
+
1890
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
1891
+ for (auto it : ctx_map) {
1892
+ auto * buft = it.first;
1893
+ auto * ctx = it.second;
1894
+
1895
+ lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
1896
+ if (!buf) {
1897
+ throw std::runtime_error("failed to allocate buffer for kv cache");
1898
+ }
1899
+ lm_ggml_backend_buffer_clear(buf, 0);
1900
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
1901
+ bufs.emplace_back(buf);
1902
+ }
1903
+
1904
+ {
1905
+ const size_t memory_size_k = size_k_bytes();
1906
+ const size_t memory_size_v = size_v_bytes();
1907
+
1908
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
1909
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
1910
+ lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
1911
+ lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
1912
+ }
1913
+ }
1914
+
1915
+ void llama_kv_cache_recurrent::clear() {
1916
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
1917
+ cells[i].pos = -1;
1918
+ cells[i].seq_id.clear();
1919
+ cells[i].src = -1;
1920
+ cells[i].tail = -1;
1921
+ }
1922
+ head = 0;
1923
+ used = 0;
1924
+
1925
+ for (auto & buf : bufs) {
1926
+ lm_ggml_backend_buffer_clear(buf.get(), 0);
1927
+ }
1928
+ }
1929
+
1930
+ bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
1931
+ uint32_t new_head = size;
1932
+
1933
+ if (p0 < 0) {
1934
+ p0 = 0;
1935
+ }
1936
+
1937
+ if (p1 < 0) {
1938
+ p1 = std::numeric_limits<llama_pos>::max();
1939
+ }
1940
+
1941
+ // models like Mamba or RWKV can't have a state partially erased
1942
+ if (seq_id >= (int64_t) size) {
1943
+ // could be fatal
1944
+ return false;
1945
+ }
1946
+ if (0 <= seq_id) {
1947
+ int32_t & tail_id = cells[seq_id].tail;
1948
+ if (tail_id >= 0) {
1949
+ const kv_cell & cell = cells[tail_id];
1950
+ // partial intersection is invalid
1951
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
1952
+ return false;
1953
+ }
1954
+ // invalidate tails which will be cleared
1955
+ if (p0 <= cell.pos && cell.pos < p1) {
1956
+ tail_id = -1;
1957
+ }
1958
+ }
1959
+ } else {
1960
+ // if seq_id is negative, the range should include everything or nothing
1961
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
1962
+ return false;
1963
+ }
1964
+ }
1965
+
1966
+ for (uint32_t i = 0; i < size; ++i) {
1967
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
1968
+ if (seq_id < 0) {
1969
+ cells[i].seq_id.clear();
1970
+ } else if (cells[i].has_seq_id(seq_id)) {
1971
+ cells[i].seq_id.erase(seq_id);
1972
+ } else {
1973
+ continue;
1974
+ }
1975
+ if (cells[i].is_empty()) {
1976
+ // keep count of the number of used cells
1977
+ if (cells[i].pos >= 0) {
1978
+ used--;
1979
+ }
1980
+ cells[i].pos = -1;
1981
+ cells[i].src = -1;
1982
+ if (new_head == size) {
1983
+ new_head = i;
1984
+ }
1985
+ }
1986
+ }
1987
+ }
1988
+
1989
+ // If we freed up a slot, set head to it so searching can start there.
1990
+ if (new_head != size && new_head < head) {
1991
+ head = new_head;
1992
+ }
1993
+
1994
+ return true;
1995
+ }
1996
+
1997
+ void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
1998
+ if (seq_id_src == seq_id_dst) {
1999
+ return;
2000
+ }
2001
+
2002
+ if (p0 < 0) {
2003
+ p0 = 0;
2004
+ }
2005
+
2006
+ if (p1 < 0) {
2007
+ p1 = std::numeric_limits<llama_pos>::max();
2008
+ }
2009
+
2010
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
2011
+ kv_cell & tail_src = cells[seq_id_src];
2012
+ kv_cell & tail_dst = cells[seq_id_dst];
2013
+ if (tail_dst.tail >= 0) {
2014
+ // clear destination seq_id if it wasn't empty
2015
+ kv_cell & cell_dst = cells[tail_dst.tail];
2016
+
2017
+ cell_dst.seq_id.erase(seq_id_dst);
2018
+ tail_dst.tail = -1;
2019
+ if (cell_dst.seq_id.empty()) {
2020
+ cell_dst.pos = -1;
2021
+ cell_dst.src = -1;
2022
+ used -= 1;
2023
+ }
2024
+ }
2025
+ if (tail_src.tail >= 0) {
2026
+ kv_cell & cell_src = cells[tail_src.tail];
2027
+
2028
+ cell_src.seq_id.insert(seq_id_dst);
2029
+ tail_dst.tail = tail_src.tail;
2030
+ }
2031
+ }
2032
+ }
2033
+
2034
+ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
2035
+ uint32_t new_head = size;
2036
+
2037
+ for (uint32_t i = 0; i < size; ++i) {
2038
+ if ((llama_seq_id) i != seq_id) {
2039
+ cells[i].tail = -1;
2040
+ }
2041
+
2042
+ if (!cells[i].has_seq_id(seq_id)) {
2043
+ if (cells[i].pos >= 0) {
2044
+ used--;
2045
+ }
2046
+
2047
+ cells[i].pos = -1;
2048
+ cells[i].src = -1;
2049
+ cells[i].seq_id.clear();
2050
+
2051
+ if (new_head == size){
2052
+ new_head = i;
2053
+ }
2054
+ } else {
2055
+ cells[i].seq_id.clear();
2056
+ cells[i].seq_id.insert(seq_id);
2057
+ }
2058
+ }
2059
+
2060
+ // If we freed up a slot, set head to it so searching can start there.
2061
+ if (new_head != size && new_head < head) {
2062
+ head = new_head;
2063
+ }
2064
+ }
2065
+
2066
+ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
2067
+ if (delta == 0) {
2068
+ return;
2069
+ }
2070
+
2071
+ if (p0 < 0) {
2072
+ p0 = 0;
2073
+ }
2074
+
2075
+ if (p1 < 0) {
2076
+ p1 = std::numeric_limits<llama_pos>::max();
2077
+ }
2078
+
2079
+ // If there is no range then return early to avoid looping over the cache.
2080
+ if (p0 == p1) {
2081
+ return;
2082
+ }
2083
+
2084
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
2085
+ if (0 <= seq_id && seq_id < (int64_t) size) {
2086
+ const int32_t tail_id = cells[seq_id].tail;
2087
+ if (tail_id >= 0) {
2088
+ kv_cell & cell = cells[tail_id];
2089
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2090
+ cell.pos += delta;
2091
+ }
2092
+ }
2093
+ }
2094
+ }
2095
+
2096
+ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
2097
+ if (d == 1) {
2098
+ return;
2099
+ }
2100
+
2101
+ if (p0 < 0) {
2102
+ p0 = 0;
2103
+ }
2104
+
2105
+ if (p1 < 0) {
2106
+ p1 = std::numeric_limits<llama_pos>::max();
711
2107
  }
712
2108
 
713
- for (uint32_t s = 0; s < n_seqs; s++) {
714
- for (uint32_t i = 0; i < n_seq_tokens; ++i) {
715
- uint32_t k = s*n_seq_tokens + i;
716
- cells[head + k].pos = ubatch.pos[k];
2109
+ // If there is no range then return early to avoid looping over the cache.
2110
+ if (p0 == p1) {
2111
+ return;
2112
+ }
717
2113
 
718
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
719
- cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
2114
+ // for Mamba-like or RWKV models, only the pos needs to be changed
2115
+ if (0 <= seq_id && seq_id < (int64_t) size) {
2116
+ const int32_t tail_id = cells[seq_id].tail;
2117
+ if (tail_id >= 0) {
2118
+ kv_cell & cell = cells[tail_id];
2119
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
2120
+ cell.pos /= d;
720
2121
  }
721
2122
  }
722
2123
  }
2124
+ }
723
2125
 
724
- used += n_tokens;
2126
+ llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
2127
+ llama_pos result = std::numeric_limits<llama_pos>::max();
725
2128
 
726
- pending.ranges.push_back({head, head + n_tokens});
2129
+ for (uint32_t i = 0; i < size; ++i) {
2130
+ if (cells[i].has_seq_id(seq_id)) {
2131
+ result = std::min(result, cells[i].pos);
2132
+ }
2133
+ }
727
2134
 
728
- return true;
729
- }
2135
+ if (result == std::numeric_limits<llama_pos>::max()) {
2136
+ result = -1;
2137
+ }
730
2138
 
731
- uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const {
732
- // the FA kernels require padding to avoid extra runtime boundary checks
733
- return cparams.flash_attn ? 256u : 32u;
2139
+ return result;
734
2140
  }
735
2141
 
736
- uint32_t llama_kv_cache_unified::cell_max() const {
737
- for (uint32_t i = size; i > 0; --i) {
738
- const llama_kv_cell & cell = cells[i - 1];
2142
+ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
2143
+ llama_pos result = -1;
739
2144
 
740
- if (cell.pos >= 0 && !cell.is_empty()) {
741
- return i;
2145
+ for (uint32_t i = 0; i < size; ++i) {
2146
+ if (cells[i].has_seq_id(seq_id)) {
2147
+ result = std::max(result, cells[i].pos);
742
2148
  }
743
2149
  }
744
2150
 
745
- return 0;
2151
+ return result;
746
2152
  }
747
2153
 
748
- size_t llama_kv_cache_unified::size_k_bytes() const {
749
- size_t size_k_bytes = 0;
750
-
751
- for (const auto & k : k_l) {
752
- size_k_bytes += lm_ggml_nbytes(k);
2154
+ void llama_kv_cache_recurrent::restore() {
2155
+ if (pending.ranges.empty()) {
2156
+ return;
753
2157
  }
754
2158
 
755
- return size_k_bytes;
2159
+ seq_rm(-1, -1, -1);
756
2160
  }
757
2161
 
758
- size_t llama_kv_cache_unified::size_v_bytes() const {
759
- size_t size_v_bytes = 0;
2162
+ void llama_kv_cache_recurrent::commit() {
2163
+ pending.ranges.clear();
2164
+ }
760
2165
 
761
- for (const auto & v : v_l) {
762
- size_v_bytes += lm_ggml_nbytes(v);
763
- }
2166
+ bool llama_kv_cache_recurrent::update(llama_context & ctx) {
2167
+ LM_GGML_UNUSED(ctx);
2168
+ return false;
2169
+ }
764
2170
 
765
- return size_v_bytes;
2171
+ void llama_kv_cache_recurrent::defrag_sched(float thold) {
2172
+ LM_GGML_UNUSED(thold);
2173
+ // noop
766
2174
  }
767
2175
 
768
- bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
769
- const uint32_t n_layer = hparams.n_layer;
2176
+ void llama_kv_cache_recurrent::set_full() {
2177
+ n = size;
2178
+ head = 0;
2179
+ }
770
2180
 
771
- const uint32_t n_kv = cell_max();
772
- const uint32_t n_used = used;
2181
+ llama_sbatch llama_kv_cache_recurrent::sbatch_init(
2182
+ const llama_batch & batch,
2183
+ bool logits_all) {
2184
+ return llama_sbatch(batch, hparams.n_embd, false, logits_all);
2185
+ }
773
2186
 
774
- assert(n_used <= n_kv);
2187
+ llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
2188
+ if (embd_pooled) {
2189
+ // Pooled embeddings cannot be split across ubatches (yet)
2190
+ return sbatch.split_seq(n_ubatch);
2191
+ }
775
2192
 
776
- //const int64_t t_start = lm_ggml_time_us();
2193
+ return sbatch.split_equal(n_ubatch);
2194
+ }
777
2195
 
778
- // number of cells moved
779
- uint32_t n_moves = 0;
2196
+ bool llama_kv_cache_recurrent::find_slot(
2197
+ const llama_ubatch & ubatch) {
2198
+ const uint32_t n_tokens = ubatch.n_tokens;
2199
+ const uint32_t n_seqs = ubatch.n_seqs;
780
2200
 
781
- // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
782
- // - source view, destination view, copy operation
783
- // - x2 for keys and values
784
- //const uint32_t max_moves = max_nodes()/(6*n_layer);
785
- // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
786
- const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
2201
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
787
2202
 
788
- // determine which KV cells to move where
789
- //
790
- // cell i moves to ids[i]
791
- //
792
- // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
793
- //
794
- auto & ids = defrag_info.ids;
2203
+ // if we have enough unused cells before the current head ->
2204
+ // better to start searching from the beginning of the cache, hoping to fill it
2205
+ if (head > used + 2*n_tokens) {
2206
+ head = 0;
2207
+ }
795
2208
 
796
- ids.clear();
797
- ids.resize(n_kv, n_kv);
2209
+ // For recurrent state architectures (like Mamba or RWKV),
2210
+ // each cache cell can store the state for a whole sequence.
2211
+ // A slot should always be contiguous.
798
2212
 
799
- for (uint32_t i0 = 0; i0 < n_used; ++i0) {
800
- const auto & cell0 = cells[i0];
2213
+ // can only process batches with an equal number of new tokens in each sequence
2214
+ LM_GGML_ASSERT(ubatch.equal_seqs);
801
2215
 
802
- if (!cell0.is_empty()) {
803
- ids[i0] = i0;
2216
+ int32_t min = size - 1;
2217
+ int32_t max = 0;
804
2218
 
805
- continue;
2219
+ // everything should fit if all seq_ids are smaller than the max
2220
+ for (uint32_t s = 0; s < n_seqs; ++s) {
2221
+ const uint32_t n_seq_id = ubatch.n_seq_id[s];
2222
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
2223
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
2224
+
2225
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
2226
+ // too big seq_id
2227
+ // TODO: would it be possible to resize the cache instead?
2228
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u; try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
2229
+ return false;
2230
+ }
2231
+ if (j > 0) {
2232
+ kv_cell & seq = cells[seq_id];
2233
+ if (seq.tail >= 0) {
2234
+ kv_cell & cell = cells[seq.tail];
2235
+ // clear cells from seq_ids that become shared
2236
+ // (should not normally happen, but let's handle it anyway)
2237
+ cell.seq_id.erase(seq_id);
2238
+ seq.tail = -1;
2239
+ if (cell.seq_id.empty()) {
2240
+ cell.pos = -1;
2241
+ cell.src = -1;
2242
+ used -= 1;
2243
+ }
2244
+ }
2245
+ }
806
2246
  }
2247
+ }
807
2248
 
808
- // found a hole - fill it with data from the end of the cache
2249
+ #ifndef NDEBUG
2250
+ {
2251
+ std::vector<int32_t> tails_verif;
2252
+ tails_verif.assign(size, -1);
2253
+ for (uint32_t i = 0; i < size; ++i) {
2254
+ kv_cell & cell = cells[i];
2255
+ for (llama_seq_id seq_id : cell.seq_id) {
2256
+ if (tails_verif[seq_id] != -1) {
2257
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
2258
+ }
2259
+ tails_verif[seq_id] = i;
2260
+ }
2261
+ }
2262
+ for (uint32_t i = 0; i < size; ++i) {
2263
+ if (tails_verif[i] != cells[i].tail) {
2264
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
2265
+ }
2266
+ }
2267
+ }
2268
+ #endif
809
2269
 
810
- uint32_t nh = 1;
2270
+ // find next empty cell
2271
+ uint32_t next_empty_cell = head;
811
2272
 
812
- // determine the size of the hole
813
- while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
814
- nh++;
2273
+ for (uint32_t i = 0; i < size; ++i) {
2274
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
2275
+ kv_cell & cell = cells[next_empty_cell];
2276
+ if (cell.is_empty()) { break; }
2277
+ next_empty_cell += 1;
2278
+ }
2279
+
2280
+ // find usable cell range
2281
+ for (uint32_t s = 0; s < n_seqs; ++s) {
2282
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
2283
+ kv_cell & seq_meta = cells[seq_id];
2284
+ bool has_cell = false;
2285
+ if (seq_meta.tail >= 0) {
2286
+ kv_cell & cell = cells[seq_meta.tail];
2287
+ LM_GGML_ASSERT(cell.has_seq_id(seq_id));
2288
+ // does this seq_id "own" the cell?
2289
+ if (cell.seq_id.size() == 1) { has_cell = true; }
2290
+ }
2291
+ if (!has_cell) {
2292
+ kv_cell & empty_cell = cells[next_empty_cell];
2293
+ LM_GGML_ASSERT(empty_cell.is_empty());
2294
+ // copy old tail into the empty cell
2295
+ if (seq_meta.tail >= 0) {
2296
+ kv_cell & orig_cell = cells[seq_meta.tail];
2297
+ empty_cell.pos = orig_cell.pos;
2298
+ empty_cell.src = orig_cell.src;
2299
+ orig_cell.seq_id.erase(seq_id);
2300
+ empty_cell.seq_id.insert(seq_id); // will be overwritten
2301
+ }
2302
+ seq_meta.tail = next_empty_cell;
2303
+ // find next empty cell
2304
+ if (s + 1 < n_seqs) {
2305
+ next_empty_cell += 1;
2306
+ for (uint32_t i = 0; i < size; ++i) {
2307
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
2308
+ kv_cell & cell = cells[next_empty_cell];
2309
+ if (cell.is_empty()) { break; }
2310
+ next_empty_cell += 1;
2311
+ }
2312
+ }
815
2313
  }
2314
+ if (min > seq_meta.tail) { min = seq_meta.tail; }
2315
+ if (max < seq_meta.tail) { max = seq_meta.tail; }
2316
+ }
816
2317
 
817
- uint32_t nf = 0;
818
- uint32_t is = n_kv - 1;
2318
+ // gather and re-order
2319
+ for (uint32_t s = 0; s < n_seqs; ++s) {
2320
+ int32_t dst_id = s + min;
2321
+ int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
2322
+ if (dst_id != src_id) {
2323
+ kv_cell & dst_cell = cells[dst_id];
2324
+ kv_cell & src_cell = cells[src_id];
819
2325
 
820
- // starting from the end, find nh non-empty cells
821
- for (; is > i0; --is) {
822
- const auto & cell1 = cells[is];
2326
+ std::swap(dst_cell.pos, src_cell.pos);
+ std::swap(dst_cell.src, src_cell.src);
+ std::swap(dst_cell.seq_id, src_cell.seq_id);
 
- if (cell1.is_empty() || ids[is] != n_kv) {
- continue;
+ // swap tails (assuming they NEVER overlap)
+ for (const llama_seq_id seq_id : src_cell.seq_id) {
+ cells[seq_id].tail = src_id;
+ }
+ for (const llama_seq_id seq_id : dst_cell.seq_id) {
+ cells[seq_id].tail = dst_id;
  }
+ }
+ }
 
- // non-empty cell which is not yet moved
- nf++;
+ // update the pos of the used seqs
+ for (uint32_t s = 0; s < n_seqs; ++s) {
+ const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+ int32_t cell_id = s + min;
+ kv_cell & cell = cells[cell_id];
 
- if (nf == nh) {
- break;
- }
+ if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
+ // What should happen when the pos backtracks or skips a value?
+ // Clearing the state mid-batch would require special-casing which isn't done.
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
+ __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
+ }
+ cell.pos = last_pos;
+ cell.seq_id.clear();
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
+ cell.seq_id.insert(seq_id);
+ cells[seq_id].tail = cell_id;
  }
+ }
 
- // this can only happen if `n_used` is not accurate, which would be a bug
- LM_GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
+ // allow getting the range of used cells, from head to head + n
+ head = min;
+ n = max - min + 1;
+ used = std::count_if(cells.begin(), cells.end(),
+ [](const kv_cell & cell){ return !cell.is_empty(); });
 
- nf = 0;
+ // sanity check
+ return n >= n_seqs;
+ }
 
- uint32_t i1 = is;
+ bool llama_kv_cache_recurrent::get_can_shift() const {
+ return false;
+ }
 
- // are we moving a continuous block of memory?
- bool cont = false;
+ int32_t llama_kv_cache_recurrent::s_copy(int i) const {
+ const uint32_t cell_id = i + head;
 
- // should we stop searching for the next move?
- bool stop = false;
+ //////////////////////////////////////////////
+ // TODO: this should not mutate the KV cache !
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
 
- // go back and move the nf cells to the hole
- for (; i1 < n_kv; ++i1) {
- auto & cell1 = cells[i1];
+ // prevent out-of-bound sources
+ if (cell.src < 0 || (uint32_t) cell.src >= size) {
+ cell.src = cell_id;
+ }
 
- if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
+ int32_t res = cell.src;
 
- cont = false;
- continue;
- }
+ // TODO: do not mutate the KV cache
+ // ensure copy only happens once
+ if (cell.src != (int32_t) cell_id) {
+ cell.src = cell_id;
+ }
 
- // this cell goes to (i0 + nf)
- ids[i1] = i0 + nf;
+ return res;
+ }
 
- // move the cell meta data
- cells[i0 + nf] = cell1;
+ float llama_kv_cache_recurrent::s_mask(int i) const {
+ const uint32_t cell_id = i + head;
 
- // clear the old cell and move the head there
- cell1 = llama_kv_cell();
- head = n_used;
+ //////////////////////////////////////////////
+ // TODO: this should not mutate the KV cache !
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
 
- if (!cont) {
- n_moves++;
- cont = true;
- }
+ float res = (float) (cell.src >= 0);
 
- nf++;
+ // only clear once
+ if (cell.src < 0) {
+ cell.src = cell_id;
+ }
 
- if (nf == nh) {
- break;
- }
- }
+ return res;
+ }
 
- if (stop || n_moves == max_moves) {
- break;
+ uint32_t llama_kv_cache_recurrent::cell_max() const {
+ for (uint32_t i = size; i > 0; --i) {
+ const kv_cell & cell = cells[i - 1];
+
+ if (cell.pos >= 0 && !cell.is_empty()) {
+ return i;
  }
+ }
 
- //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+ return 0;
+ }
 
- i0 += nh - 1;
+ size_t llama_kv_cache_recurrent::total_size() const {
+ size_t size = 0;
+ for (const auto & buf : bufs) {
+ size += lm_ggml_backend_buffer_get_size(buf.get());
  }
 
- if (n_moves == 0) {
- return false;
+ return size;
+ }
+
+ size_t llama_kv_cache_recurrent::size_k_bytes() const {
+ size_t size_k_bytes = 0;
+
+ for (const auto & k : k_l) {
+ size_k_bytes += lm_ggml_nbytes(k);
  }
 
- LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+ return size_k_bytes;
+ }
 
- LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ size_t llama_kv_cache_recurrent::size_v_bytes() const {
+ size_t size_v_bytes = 0;
 
- return true;
+ for (const auto & v : v_l) {
+ size_v_bytes += lm_ggml_nbytes(v);
+ }
+
+ return size_v_bytes;
  }
 
- void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+ void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
  std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
  uint32_t cell_count = 0;
 
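The hunk above swaps the unified cache's defrag bookkeeping for `llama_kv_cache_recurrent`'s slot and state handling, including the `s_copy()`/`s_mask()` pair that reports a copy source or validity mask once and then resets `cell.src`. As a rough mental model of that contract, here is a minimal standalone sketch; `toy_cell`, `toy_recurrent_cache`, and the method signatures are illustrative stand-ins, not the package's actual types.

```cpp
// Standalone sketch (not the library's real types): models how a recurrent
// cache cell's `src` field yields a copy source exactly once, mirroring the
// s_copy()/s_mask() logic in the hunk above.
#include <cassert>
#include <cstdint>
#include <vector>

struct toy_cell {
    int32_t src = -1; // < 0 means "no restored state to copy"
};

struct toy_recurrent_cache {
    std::vector<toy_cell> cells;
    uint32_t head = 0;

    explicit toy_recurrent_cache(size_t n) : cells(n) {}

    // returns the cell to copy the state from, then marks the copy as done
    int32_t s_copy(int i) {
        const uint32_t cell_id = i + head;
        toy_cell & cell = cells[cell_id];

        // out-of-bound or negative sources fall back to the cell itself
        if (cell.src < 0 || (uint32_t) cell.src >= cells.size()) {
            cell.src = (int32_t) cell_id;
        }

        const int32_t res = cell.src;
        cell.src = (int32_t) cell_id; // ensure the copy only happens once
        return res;
    }

    // 1.0f if the cell holds a valid state, 0.0f if it must be cleared
    float s_mask(int i) {
        const uint32_t cell_id = i + head;
        toy_cell & cell = cells[cell_id];

        const float res = (float) (cell.src >= 0);
        if (cell.src < 0) {
            cell.src = (int32_t) cell_id; // only clear once
        }
        return res;
    }
};

int main() {
    toy_recurrent_cache cache(4);
    cache.cells[0].src = 2;          // cell 0 should copy its state from cell 2
    assert(cache.s_copy(0) == 2);    // first call reports the source...
    assert(cache.s_copy(0) == 0);    // ...later calls are a no-op self-copy
    assert(cache.s_mask(0) == 1.0f); // cell 0 now holds a valid state
    return 0;
}
```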
@@ -940,11 +2491,12 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
  state_write_data(io, cell_ranges);
  }
 
- void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
  uint32_t cell_count;
  io.read_to(&cell_count, sizeof(cell_count));
 
  bool res = true;
+
  res = res && state_read_meta(io, cell_count, seq_id);
  res = res && state_read_data(io, cell_count);
 
@@ -958,7 +2510,7 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
  }
  }
 
- void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+ void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
  for (const auto & range : cell_ranges) {
  for (uint32_t i = range.first; i < range.second; ++i) {
  const auto & cell = cells[i];
@@ -977,8 +2529,8 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::
  }
  }
 
- void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
- const uint32_t v_trans = this->v_trans ? 1 : 0;
+ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+ const uint32_t v_trans = 0;
  const uint32_t n_layer = hparams.n_layer;
 
  io.write(&v_trans, sizeof(v_trans));
@@ -1057,7 +2609,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
  }
  }
 
- bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+ bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
  if (dest_seq_id != -1) {
  // single sequence
 
@@ -1110,7 +2662,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  clear();
 
  for (uint32_t i = 0; i < cell_count; ++i) {
- llama_kv_cell & cell = cells[i];
+ kv_cell & cell = cells[i];
 
  llama_pos pos;
  uint32_t n_seq_id;
@@ -1124,7 +2676,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  llama_seq_id seq_id;
  io.read_to(&seq_id, sizeof(seq_id));
 
- // TODO: llama_kv_cache_unified should have a notion of max sequences
+ // TODO: llama_kv_cache_recurrent should have a notion of max sequences
  //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
  if (seq_id < 0) {
  //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
@@ -1134,14 +2686,12 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
 
  cell.seq_id.insert(seq_id);
 
- if (recurrent) {
- int32_t & tail = cells[seq_id].tail;
- if (tail != -1) {
- LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
- return false;
- }
- tail = i;
+ int32_t & tail = cells[seq_id].tail;
+ if (tail != -1) {
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+ return false;
  }
+ tail = i;
  }
  }
 
@@ -1149,18 +2699,16 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
  used = cell_count;
  }
 
- if (recurrent) {
- for (uint32_t i = 0; i < cell_count; ++i) {
- uint32_t cell_id = head + i;
- // make sure the recurrent states will keep their restored state
- cells[cell_id].src = cell_id;
- }
+ for (uint32_t i = 0; i < cell_count; ++i) {
+ uint32_t cell_id = head + i;
+ // make sure the recurrent states will keep their restored state
+ cells[cell_id].src = cell_id;
  }
 
  return true;
  }
 
- bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
  uint32_t v_trans;
  uint32_t n_layer;
  io.read_to(&v_trans, sizeof(v_trans));
@@ -1174,7 +2722,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
  LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
  return false;
  }
- if (v_trans != (bool) v_trans) {
+ if (false != (bool) v_trans) {
  LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
  return false;
  }
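For context on the `false != (bool) v_trans` check above: `state_write_data()` for the recurrent cache hard-codes `v_trans = 0` (recurrent states are never stored V-transposed), so any session blob whose flag decodes as true is rejected on read. A toy round-trip of just that flag, with made-up stream helpers that are not part of the package:

```cpp
// Illustrative only: serialize and validate a v_trans compatibility flag the
// way the recurrent cache does (always write 0, reject anything non-zero).
#include <cstdint>
#include <cstdio>
#include <istream>
#include <ostream>
#include <sstream>

static void write_flags(std::ostream & out) {
    const uint32_t v_trans = 0; // recurrent cache never stores V transposed
    out.write(reinterpret_cast<const char *>(&v_trans), sizeof(v_trans));
}

static bool read_flags(std::istream & in) {
    uint32_t v_trans = 0;
    in.read(reinterpret_cast<char *>(&v_trans), sizeof(v_trans));
    if (false != (bool) v_trans) {
        std::fprintf(stderr, "incompatible V transposition\n");
        return false;
    }
    return true;
}

int main() {
    std::stringstream ss;
    write_flags(ss);
    return read_flags(ss) ? 0 : 1;
}
```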
@@ -1277,104 +2825,3 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
 
  return true;
  }
-
- //
- // kv cache view
- //
-
- llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
- llama_kv_cache_view result = {
- /*.n_cells = */ 0,
- /*.n_seq_max = */ n_seq_max,
- /*.token_count = */ 0,
- /*.used_cells = */ kv.get_used_cells(),
- /*.max_contiguous = */ 0,
- /*.max_contiguous_idx = */ -1,
- /*.cells = */ nullptr,
- /*.cells_sequences = */ nullptr,
- };
-
- return result;
- }
-
- void llama_kv_cache_view_free(llama_kv_cache_view * view) {
- if (view->cells != nullptr) {
- free(view->cells);
- view->cells = nullptr;
- }
- if (view->cells_sequences != nullptr) {
- free(view->cells_sequences);
- view->cells_sequences = nullptr;
- }
- }
-
- void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
- // TODO: rework this in the future, for now quick hack
- const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
- if (kvu == nullptr) {
- LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
- return;
- }
-
- if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
- view->n_cells = int32_t(kvu->size);
- void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
- LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
- view->cells = (llama_kv_cache_view_cell *)p;
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
- LM_GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
- view->cells_sequences = (llama_seq_id *)p;
- }
-
- const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
- llama_kv_cache_view_cell * c_curr = view->cells;
- llama_seq_id * cs_curr = view->cells_sequences;
- int32_t used_cells = 0;
- int32_t token_count = 0;
- int32_t curr_contig_idx = -1;
- uint32_t max_contig = 0;
- int32_t max_contig_idx = -1;
-
- for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
- const size_t curr_size = kv_cells[i].seq_id.size();
- token_count += curr_size;
- c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
-
- if (curr_size > 0) {
- if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
- max_contig = i - curr_contig_idx;
- max_contig_idx = curr_contig_idx;
- }
- curr_contig_idx = -1;
- } else if (curr_contig_idx < 0) {
- curr_contig_idx = i;
- }
-
- int seq_idx = 0;
- for (const llama_seq_id it : kv_cells[i].seq_id) {
- if (seq_idx >= view->n_seq_max) {
- break;
- }
- cs_curr[seq_idx] = it;
- seq_idx++;
- }
- if (seq_idx != 0) {
- used_cells++;
- }
- for (; seq_idx < view->n_seq_max; seq_idx++) {
- cs_curr[seq_idx] = -1;
- }
- }
- if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
- max_contig_idx = curr_contig_idx;
- max_contig = kv_cells.size() - curr_contig_idx;
- }
- view->max_contiguous = max_contig;
- view->max_contiguous_idx = max_contig_idx;
- view->token_count = token_count;
- view->used_cells = used_cells;
- if (uint32_t(used_cells) != kvu->used) {
- LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
- __func__, kvu->used, used_cells);
- }
- }
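The final hunk removes the kv-cache-view helpers entirely (they only ever worked with `llama_kv_cache_unified`). One piece of behaviour the removed `llama_kv_cache_view_update()` provided was tracking the longest contiguous run of free cells. A minimal standalone equivalent of that bookkeeping is sketched below; the occupancy vector, struct, and function names are illustrative, not the removed API.

```cpp
// Sketch of the max-contiguous-free-run computation the removed view updater
// performed, over a plain bool occupancy vector instead of real KV cells.
#include <cstdint>
#include <cstdio>
#include <vector>

struct contig_info {
    uint32_t max_contiguous     = 0;  // length of the longest free run
    int32_t  max_contiguous_idx = -1; // where that run starts, -1 if none
};

static contig_info find_max_contiguous(const std::vector<bool> & occupied) {
    contig_info info;
    int32_t curr_start = -1; // start of the free run currently being scanned

    for (int32_t i = 0; i < (int32_t) occupied.size(); ++i) {
        if (occupied[i]) {
            // an occupied cell closes the current free run
            if (curr_start >= 0 && uint32_t(i - curr_start) > info.max_contiguous) {
                info.max_contiguous     = i - curr_start;
                info.max_contiguous_idx = curr_start;
            }
            curr_start = -1;
        } else if (curr_start < 0) {
            curr_start = i; // a free cell opens a new run
        }
    }
    // a free run may extend to the end of the cache
    if (curr_start >= 0 && occupied.size() - curr_start > info.max_contiguous) {
        info.max_contiguous     = (uint32_t) (occupied.size() - curr_start);
        info.max_contiguous_idx = curr_start;
    }
    return info;
}

int main() {
    const contig_info info =
        find_max_contiguous({true, false, false, true, false, false, false});
    std::printf("longest free run: %u cells starting at %d\n",
                info.max_contiguous, info.max_contiguous_idx);
    return 0;
}
```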