cui-llama.rn 1.7.3 → 1.7.6

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/llama-kv-cache-unified.cpp
@@ -0,0 +1,1841 @@
+ #include "llama-kv-cache-unified.h"
+
+ #include "llama-impl.h"
+ #include "llama-io.h"
+ #include "llama-model.h"
+ #include "llama-context.h"
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <limits>
+ #include <map>
+ #include <stdexcept>
+
+ //
+ // llama_kv_cache_unified
+ //
+
+ llama_kv_cache_unified::llama_kv_cache_unified(
+ const llama_model & model,
+ layer_filter_cb && filter,
+ lm_ggml_type type_k,
+ lm_ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type) :
+ model(model), hparams(model.hparams), v_trans(v_trans),
+ n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+
+ LM_GGML_ASSERT(kv_size % n_pad == 0);
+
+ // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
+ auto n_layer_cache = hparams.n_layer;
+ if (model.arch == LLM_ARCH_GEMMA3N) {
+ n_layer_cache = 20;
+ }
+
+ // create a context for each buffer type
+ std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+ auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ lm_ggml_init_params params = {
+ /*.mem_size =*/ size_t(2u*n_layer_cache*lm_ggml_tensor_overhead()),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ lm_ggml_context * ctx = lm_ggml_init(params);
+ if (!ctx) {
+ return nullptr;
+ }
+
+ ctx_map[buft] = ctx;
+ ctxs.emplace_back(ctx);
+
+ return ctx;
+ }
+
+ return it->second;
+ };
+
+ head = 0;
+
+ cells.resize(kv_size);
+
+ for (uint32_t il = 0; il < n_layer_cache; il++) {
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ continue;
+ }
+
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+
+ const char * dev_name = "CPU";
+
+ lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
+
+ if (offload) {
+ auto * dev = model.dev_layer(il);
+ buft = lm_ggml_backend_dev_buffer_type(dev);
+
+ dev_name = lm_ggml_backend_dev_name(dev);
+ }
+
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
+
+ lm_ggml_context * ctx = ctx_for_buft(buft);
+ if (!ctx) {
+ throw std::runtime_error("failed to create ggml context for kv cache");
+ }
+
+ lm_ggml_tensor * k;
+ lm_ggml_tensor * v;
+
+ k = lm_ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
+ v = lm_ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
+
+ lm_ggml_format_name(k, "cache_k_l%d", il);
+ lm_ggml_format_name(v, "cache_v_l%d", il);
+
+ map_layer_ids[il] = layers.size();
+ layers.push_back({ il, k, v });
+ }
+
+ // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
+ if (model.arch == LLM_ARCH_GEMMA3N) {
+ LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+
+ for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ continue;
+ }
+
+ const bool is_swa = hparams.is_swa(il);
+ const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+
+ LM_GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+ map_layer_ids[il] = map_layer_ids[il_reuse];
+
+ LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+ }
+ }
+
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
+ for (auto it : ctx_map) {
+ auto * buft = it.first;
+ auto * ctx = it.second;
+
+ lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (!buf) {
+ throw std::runtime_error("failed to allocate buffer for kv cache");
+ }
+
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+ lm_ggml_backend_buffer_clear(buf, 0);
+ bufs.emplace_back(buf);
+ }
+
+ {
+ const size_t memory_size_k = size_k_bytes();
+ const size_t memory_size_v = size_v_bytes();
+
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
+ lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+ lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
+ }
+
+ const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+ debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
+ }
+
+ void llama_kv_cache_unified::clear(bool data) {
+ cells.reset();
+
+ head = 0;
+
+ if (data) {
+ for (auto & buf : bufs) {
+ lm_ggml_backend_buffer_clear(buf.get(), 0);
+ }
+ }
+ }
+
+ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ uint32_t new_head = cells.size();
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ if (seq_id >= 0) {
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+ } else {
+ // match any sequence
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ cells.rm(i);
+
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cells.size() && new_head < head) {
+ head = new_head;
+ }
+
+ return true;
+ }
+
+ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id_src)) {
+ cells.seq_add(i, seq_id_dst);
+ }
+ }
+ }
+
+ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+ uint32_t new_head = cells.size();
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (cells.seq_keep(i, seq_id)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cells.size() && new_head < head) {
+ head = new_head;
+ }
+ }
+
+ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+ if (shift == 0) {
+ return;
+ }
+
+ uint32_t new_head = cells.size();
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over all cells.
+ if (p0 == p1) {
+ return;
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id)) {
+ if (cells.pos_add(i, shift)) {
+ if (new_head == cells.size()) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ head = new_head != cells.size() ? new_head : 0;
+ }
+
+ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ if (d == 1) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (!cells.pos_in(i, p0, p1)) {
+ continue;
+ }
+
+ if (cells.seq_has(i, seq_id)) {
+ cells.pos_div(i, d);
+ }
+ }
+ }
+
+ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+ return cells.seq_pos_min(seq_id);
+ }
+
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+ return cells.seq_pos_max(seq_id);
+ }
+
+ llama_memory_context_ptr llama_kv_cache_unified::init_batch(
+ llama_batch_allocr & balloc,
+ uint32_t n_ubatch,
+ bool embd_all) {
+ LM_GGML_UNUSED(embd_all);
+
+ do {
+ balloc.split_reset();
+
+ std::vector<llama_ubatch> ubatches;
+ while (true) {
+ auto ubatch = balloc.split_simple(n_ubatch);
+
+ if (ubatch.n_tokens == 0) {
+ break;
+ }
+
+ ubatches.push_back(std::move(ubatch)); // NOLINT
+ }
+
+ auto heads = prepare(ubatches);
+ if (heads.empty()) {
+ break;
+ }
+
+ return std::make_unique<llama_kv_cache_unified_context>(
+ this, std::move(heads), std::move(ubatches));
+ } while (false);
+
+ return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
+
+ llama_memory_context_ptr llama_kv_cache_unified::init_full() {
+ return std::make_unique<llama_kv_cache_unified_context>(this);
+ }
+
+ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
+ bool do_shift = get_has_shift();
+
+ defrag_info dinfo;
+
+ // see if we need to defrag
+ {
+ bool do_defrag = optimize;
+
+ const auto thold = lctx->get_cparams().defrag_thold;
+
+ if (!do_defrag && thold > 0.0f) {
+ const auto n_kv = cells.used_max_p1();
+
+ // - do not defrag small contexts (i.e. < 2048 tokens)
+ // - count the padding towards the number of used tokens
+ const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
+
+ if (fragmentation > thold) {
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
+
+ do_defrag = true;
+ }
+ }
+
+ if (do_defrag) {
+ dinfo = defrag_prepare(lctx->graph_max_nodes());
+ }
+ }
+
+ return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo));
+ }
+
+ llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
+ llama_kv_cache_unified::ubatch_heads res;
+
+ struct state {
+ uint32_t head_old; // old position of the head, before placing the ubatch
+ uint32_t head_new; // new position of the head, after placing the ubatch
+
+ llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch
+ };
+
+ // remember the old state of the cells so we can restore it in the end
+ std::vector<state> states;
+
+ bool success = true;
+
+ for (const auto & ubatch : ubatches) {
+ // only find a suitable slot for the ubatch. don't modify the cells yet
+ const int32_t head_new = find_slot(ubatch);
+ if (head_new < 0) {
+ success = false;
+ break;
+ }
+
+ // remember the position that we found
+ res.push_back(head_new);
+
+ // store the old state of the cells in the recovery stack
+ states.push_back({head, (uint32_t) head_new, cells.cp(head_new, ubatch.n_tokens)});
+
+ // now emplace the ubatch
+ apply_ubatch(head_new, ubatch);
+ }
+
+ // iterate backwards and restore the cells to their original state
+ for (auto it = states.rbegin(); it != states.rend(); ++it) {
+ cells.set(it->head_new, it->cells);
+ head = it->head_old;
+ }
+
+ if (!success) {
+ return {};
+ }
+
+ return res;
+ }
+
+ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) {
+ bool updated = false;
+
+ auto * sched = lctx->get_sched();
+
+ if (do_shift) {
+ if (!get_can_shift()) {
+ LM_GGML_ABORT("The current KV cache / model configuration does not support K-shift");
+ }
+
+ LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
+
+ // apply K-shift if needed
+ if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
+ lm_ggml_backend_sched_reset(sched);
+
+ auto * gf = lctx->graph_init();
+
+ auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
+ if (!res) {
+ LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
+ return updated;
+ }
+
+ if (!lm_ggml_backend_sched_alloc_graph(sched, gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
+ return updated;
+ }
+
+ res->set_inputs(nullptr);
+
+ if (lctx->graph_compute(gf, false) != LM_GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
+ return updated;
+ }
+
+ updated = true;
+ }
+
+ cells.reset_shift();
+ }
+
+ if (!dinfo.empty()) {
+ LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+
+ // apply moves:
+ {
+ const auto n_kv = dinfo.ids.size();
+
+ for (uint32_t i = 0; i < n_kv; ++i) {
+ assert(dinfo.ids[i] <= n_kv);
+
+ if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
+ continue;
+ }
+
+ cells.mv(i, dinfo.ids[i]);
+ }
+
+ // reset the head so we can find the first free slot during the next ubatch
+ head = 0;
+ }
+
+ lm_ggml_backend_sched_reset(sched);
+
+ auto * gf = lctx->graph_init();
+
+ auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
+ if (!res) {
+ LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
+ return updated;
+ }
+
+ if (!lm_ggml_backend_sched_alloc_graph(sched, gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
+ return updated;
+ }
+
+ res->set_inputs(nullptr);
+
+ if (lctx->graph_compute(gf, false) != LM_GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
+ return updated;
+ }
+
+ updated = true;
+ }
+
+ return updated;
+ }
+
+ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
+ const uint32_t n_tokens = ubatch.n_tokens;
+
+ uint32_t head_cur = this->head;
+
+ // if we have enough unused cells before the current head ->
+ // better to start searching from the beginning of the cache, hoping to fill it
+ if (head_cur > cells.get_used() + 2*ubatch.n_tokens) {
+ head_cur = 0;
+ }
+
+ if (n_tokens > cells.size()) {
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
+ return -1;
+ }
+
+ if (debug > 0) {
+ LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
+
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
+ std::string ss;
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ if (cells.is_empty(i)) {
+ ss += '.';
+ } else {
+ assert(cells.seq_count(i) >= 1);
+
+ if (cells.seq_count(i) == 1) {
+ ss += std::to_string(cells.seq_get(i));
+ } else {
+ ss += 'M';
+ }
+ }
+ if (i%256 == 255) {
+ ss += " *";
+ ss += '\n';
+ }
+ }
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+ }
+
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
+ std::string ss;
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ std::string cur;
+ if (cells.is_empty(i)) {
+ cur = '.';
+ } else {
+ cur = std::to_string(cells.pos_get(i));
+ }
+ const int n = cur.size();
+ for (int j = 0; j < 5 - n; ++j) {
+ cur += ' ';
+ }
+ ss += cur;
+ if (i%256 == 255) {
+ ss += " *";
+ }
+ if (i%64 == 63) {
+ ss += '\n';
+ }
+ }
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+ }
+
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (cells.seq_pos_min(s) < 0) {
+ continue;
+ }
+
+ LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+ }
+ }
+
+ uint32_t n_tested = 0;
+
+ while (true) {
+ if (head_cur + n_tokens > cells.size()) {
+ n_tested += cells.size() - head_cur;
+ head_cur = 0;
+ continue;
+ }
+
+ bool found = true;
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ //const llama_pos pos = ubatch.pos[i];
+ //const llama_seq_id seq_id = ubatch.seq_id[i][0];
+
+ // can we use this cell? either:
+ // - the cell is empty
+ // - the cell is occupied only by one sequence:
+ // - (disabled) mask causally, if the sequence is the same as the one we are inserting
+ // - mask SWA, using current max pos for that sequence in the cache
+ // always insert in the cell with minimum pos
+ bool can_use = cells.is_empty(head_cur + i);
+
+ if (!can_use && cells.seq_count(head_cur + i) == 1) {
+ const llama_pos pos_cell = cells.pos_get(head_cur + i);
+
+ // (disabled) causal mask
+ // note: it's better to purge any "future" tokens beforehand
+ //if (cells.seq_has(head_cur + i, seq_id)) {
+ // can_use = pos_cell >= pos;
+ //}
+
+ if (!can_use) {
+ const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
+
+ // SWA mask
+ if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
+ can_use = true;
+ }
+ }
+ }
+
+ if (!can_use) {
+ found = false;
+ head_cur += i + 1;
+ n_tested += i + 1;
+ break;
+ }
+ }
+
+ if (found) {
+ break;
+ }
+
+ if (n_tested >= cells.size()) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return -1;
+ }
+ }
+
+ return head_cur;
+ }
+
+ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
+ // keep track of the max sequence position that we would overwrite with this ubatch
+ // for non-SWA cache, this would be always empty
+ llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ seq_pos_max_rm[s] = -1;
+ }
+
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ if (!cells.is_empty(head_cur + i)) {
+ assert(cells.seq_count(head_cur + i) == 1);
+
+ const llama_seq_id seq_id = cells.seq_get(head_cur + i);
+ const llama_pos pos = cells.pos_get(head_cur + i);
+
+ seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
+
+ cells.rm(head_cur + i);
+ }
+
+ cells.pos_set(head_cur + i, ubatch.pos[i]);
+
+ for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
+ cells.seq_add(head_cur + i, ubatch.seq_id[i][s]);
+ }
+ }
+
+ // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+ // will be present in the cache. so we have to purge any position which is less than those we would overwrite
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+ if (seq_pos_max_rm[s] == -1) {
+ continue;
+ }
+
+ if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+ LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+ __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+
+ seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+ }
+ }
+
+ // move the head at the end of the slot
+ head = head_cur + ubatch.n_tokens;
+ }
+
+ bool llama_kv_cache_unified::get_can_shift() const {
+ return true;
+ }
+
+ uint32_t llama_kv_cache_unified::get_size() const {
+ return cells.size();
+ }
+
+ bool llama_kv_cache_unified::get_has_shift() const {
+ return cells.get_has_shift();
+ }
+
+ uint32_t llama_kv_cache_unified::get_n_kv() const {
+ return std::min(cells.size(), std::max(n_pad, LM_GGML_PAD(cells.used_max_p1(), n_pad)));
+ }
+
+ lm_ggml_tensor * llama_kv_cache_unified::get_k(lm_ggml_context * ctx, int32_t il, uint32_t n_kv) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * k = layers[ikv].k;
+
+ return lm_ggml_view_3d(ctx, k,
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv,
+ lm_ggml_row_size(k->type, hparams.n_embd_head_k),
+ lm_ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
+ 0);
+ }
+
+ lm_ggml_tensor * llama_kv_cache_unified::get_v(lm_ggml_context * ctx, int32_t il, uint32_t n_kv) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ if (!v_trans) {
+ // note: v->nb[1] <= v->nb[2]
+ return lm_ggml_view_3d(ctx, v,
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv,
+ lm_ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+ lm_ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
+ 0);
+ }
+
+ // note: v->nb[1] > v->nb[2]
+ return lm_ggml_view_3d(ctx, v,
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v,
+ lm_ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
+ lm_ggml_row_size(v->type, v->ne[1]), // v->nb[2]
+ 0);
+ }
+
+ lm_ggml_tensor * llama_kv_cache_unified::cpy_k(lm_ggml_context * ctx, lm_ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * k = layers[ikv].k;
+
+ const int64_t n_tokens = k_cur->ne[2];
+
+ lm_ggml_tensor * k_view = lm_ggml_view_1d(ctx, k,
+ n_tokens*hparams.n_embd_k_gqa(il),
+ lm_ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head_cur);
+
+ return lm_ggml_cpy(ctx, k_cur, k_view);
+ }
+
+ lm_ggml_tensor * llama_kv_cache_unified::cpy_v(lm_ggml_context * ctx, lm_ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const {
+ const int32_t ikv = map_layer_ids.at(il);
+
+ auto * v = layers[ikv].v;
+
+ const int64_t n_tokens = v_cur->ne[2];
+
+ v_cur = lm_ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
+
+ lm_ggml_tensor * v_view = nullptr;
+
+ if (!v_trans) {
+ v_view = lm_ggml_view_1d(ctx, v,
+ n_tokens*hparams.n_embd_v_gqa(il),
+ lm_ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head_cur);
+ } else {
+ // note: the V cache is transposed when not using flash attention
+ v_view = lm_ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
+ (v->ne[1])*lm_ggml_element_size(v),
+ (head_cur)*lm_ggml_element_size(v));
+
+ v_cur = lm_ggml_transpose(ctx, v_cur);
+ }
+
+ return lm_ggml_cpy(ctx, v_cur, v_view);
+ }
+
+ void llama_kv_cache_unified::set_input_kq_mask(lm_ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+ const uint32_t n_tokens = ubatch->n_tokens;
+
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
+ float * data = (float *) dst->data;
+
+ const int64_t n_kv = dst->ne[0];
+
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
+ // Causal mask:
+ // xxx-------
+ // xxxx------
+ // xxxxx-----
+ // Non-causal mask:
+ // xxxxx-----
+ // xxxxx-----
+ // xxxxx-----
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
+ for (uint32_t h = 0; h < 1; ++h) {
+ for (uint32_t i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
+
+ const llama_pos p1 = ubatch->pos[i];
+
+ for (uint32_t j = 0; j < n_kv; ++j) {
+ float f = 0.0f;
+
+ bool masked = false;
+
+ if (cells.is_empty(j)) {
+ masked = true;
+ } else {
+ const llama_pos p0 = cells.pos_get(j);
+
+ // mask the token if not the same sequence
+ masked = masked || (!cells.seq_has(j, seq_id));
+
+ // mask future tokens
+ masked = masked || (causal_attn && p0 > p1);
+
+ // apply SWA if any
+ masked = masked || (is_masked_swa(p0, p1));
+
+ if (!masked && hparams.use_alibi) {
+ f = -std::abs(p0 - p1);
+ }
+ }
+
+ if (masked) {
+ f = -INFINITY;
+ }
+
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = f;
+ }
+ }
+
+ // mask padded tokens
+ if (data) {
+ for (uint32_t i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
+ for (uint32_t j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
+ }
+ }
+ }
+
+ void llama_kv_cache_unified::set_input_k_shift(lm_ggml_tensor * dst) const {
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
+
+ int32_t * data = (int32_t *) dst->data;
+
+ for (uint32_t i = 0; i < cells.size(); ++i) {
+ data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
+ }
+ }
+
+ void llama_kv_cache_unified::set_input_pos_bucket(lm_ggml_tensor * dst, const llama_ubatch * ubatch) const {
+ const int64_t n_tokens = ubatch->n_tokens;
+
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(dst->buffer));
+ LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+
+ int32_t * data = (int32_t *) dst->data;
+
+ const int32_t n_kv = dst->ne[0];
+
+ for (int h = 0; h < 1; ++h) {
+ for (int i = 0; i < n_tokens; ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ // the position when the cell is empty is irrelevant - it will be masked out later in the attention
909
+ const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);
910
+
911
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = llama_relative_position_bucket(p0, ubatch->pos[i], hparams.n_rel_attn_bkts, false);
912
+ }
913
+ }
914
+ }
915
+ }
916
+
917
+ size_t llama_kv_cache_unified::total_size() const {
918
+ size_t size = 0;
919
+
920
+ for (const auto & buf : bufs) {
921
+ size += lm_ggml_backend_buffer_get_size(buf.get());
922
+ }
923
+
924
+ return size;
925
+ }
926
+
927
+ size_t llama_kv_cache_unified::size_k_bytes() const {
928
+ size_t size_k_bytes = 0;
929
+
930
+ for (const auto & layer : layers) {
931
+ size_k_bytes += lm_ggml_nbytes(layer.k);
932
+ }
933
+
934
+ return size_k_bytes;
935
+ }
936
+
937
+ size_t llama_kv_cache_unified::size_v_bytes() const {
938
+ size_t size_v_bytes = 0;
939
+
940
+ for (const auto & layer : layers) {
941
+ size_v_bytes += lm_ggml_nbytes(layer.v);
942
+ }
943
+
944
+ return size_v_bytes;
945
+ }
946
+
947
+ lm_ggml_tensor * llama_kv_cache_unified::build_rope_shift(
948
+ const llama_cparams & cparams,
949
+ lm_ggml_context * ctx,
950
+ lm_ggml_tensor * cur,
951
+ lm_ggml_tensor * shift,
952
+ lm_ggml_tensor * factors,
953
+ float freq_base,
954
+ float freq_scale) const {
955
+ const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
956
+
957
+ const auto & yarn_ext_factor = cparams.yarn_ext_factor;
958
+ const auto & yarn_beta_fast = cparams.yarn_beta_fast;
959
+ const auto & yarn_beta_slow = cparams.yarn_beta_slow;
960
+
961
+ const auto & n_rot = hparams.n_rot;
962
+ const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
963
+ // @ngxson : this is a workaround
964
+ // for M-RoPE, we want to rotate the whole vector when doing KV shift
965
+ // a normal RoPE should work, we just need to use the correct ordering
966
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13870
967
+ ? LLAMA_ROPE_TYPE_NEOX
968
+ : hparams.rope_type;
969
+
970
+ // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
971
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
972
+ const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
973
+ ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
974
+ : cparams.yarn_attn_factor;
975
+
976
+ lm_ggml_tensor * tmp;
977
+
978
+ if (lm_ggml_is_quantized(cur->type)) {
979
+ // dequantize to f32 -> RoPE -> quantize back
980
+ tmp = lm_ggml_cast(ctx, cur, LM_GGML_TYPE_F32);
981
+
982
+ tmp = lm_ggml_rope_ext(ctx, tmp,
983
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
984
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
985
+
986
+ tmp = lm_ggml_cpy(ctx, tmp, cur);
987
+ } else {
988
+ // we rotate only the first n_rot dimensions
989
+ tmp = lm_ggml_rope_ext_inplace(ctx, cur,
990
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
991
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
992
+ }
993
+
994
+ return tmp;
995
+ }
996
+
997
+ class llm_graph_input_k_shift : public llm_graph_input_i {
998
+ public:
999
+ llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
1000
+ virtual ~llm_graph_input_k_shift() = default;
1001
+
1002
+ void set_input(const llama_ubatch * ubatch) override;
1003
+
1004
+ lm_ggml_tensor * k_shift; // I32 [kv_size]
1005
+
1006
+ const llama_kv_cache_unified * kv_self;
1007
+ };
1008
+
1009
+ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
1010
+ LM_GGML_UNUSED(ubatch);
1011
+
1012
+ if (k_shift) {
1013
+ kv_self->set_input_k_shift(k_shift);
1014
+ }
1015
+ }
1016
+
1017
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
1018
+ const llama_cparams & cparams,
1019
+ lm_ggml_context * ctx,
1020
+ lm_ggml_cgraph * gf) const {
1021
+ auto res = std::make_unique<llm_graph_result>();
1022
+
1023
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
1024
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
1025
+
1026
+ auto inp = std::make_unique<llm_graph_input_k_shift>(this);
1027
+
1028
+ inp->k_shift = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, cells.size());
1029
+ lm_ggml_set_input(inp->k_shift);
1030
+
1031
+ for (const auto & layer : layers) {
1032
+ const uint32_t il = layer.il;
1033
+
1034
+ const int64_t n_head_kv = hparams.n_head_kv(il);
1035
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1036
+
1037
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
1038
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
1039
+
1040
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
1041
+
1042
+ lm_ggml_tensor * k =
1043
+ lm_ggml_view_3d(ctx, layer.k,
1044
+ n_embd_head_k, n_head_kv, cells.size(),
1045
+ lm_ggml_row_size(layer.k->type, n_embd_head_k),
1046
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
1047
+ 0);
1048
+
1049
+ lm_ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
1050
+
1051
+ lm_ggml_build_forward_expand(gf, cur);
1052
+ }
1053
+
1054
+ res->add_input(std::move(inp));
1055
+
1056
+ return res;
1057
+ }
1058
+
1059
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
1060
+ const llama_cparams & cparams,
1061
+ lm_ggml_context * ctx,
1062
+ lm_ggml_cgraph * gf,
1063
+ const defrag_info & dinfo) const {
1064
+ auto res = std::make_unique<llm_graph_result>();
1065
+
1066
+ const auto & ids = dinfo.ids;
1067
+
1068
+ #if 0
1069
+ // CPU defrag
1070
+ //
1071
+ // TODO: optimizations are possible:
1072
+ // - multiple threads
1073
+ // - avoid copying to the host memory when already there
1074
+ //
1075
+ // likely not worth the effort, as we have lm_ggml_graph based defrag
1076
+ //
1077
+
1078
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
1079
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
1080
+
1081
+ const uint32_t kv_size = size;
1082
+
1083
+ std::vector<uint8_t> buf_k;
1084
+ std::vector<uint8_t> buf_v;
1085
+
1086
+ for (uint32_t il = 0; il < n_layer; ++il) {
1087
+ const size_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1088
+ const size_t k_size = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
1089
+
1090
+ const size_t v_size_el = lm_ggml_type_size(v_l[il]->type);
1091
+ const size_t v_size = lm_ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
1092
+
1093
+ buf_k.resize(k_size);
1094
+ buf_v.resize(v_size);
1095
+
1096
+ lm_ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
1097
+ lm_ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
1098
+
1099
+ // batch move [i, i+nm) to [id, id+nm)
1100
+ // note: cells can move only to a lower index
1101
+ for (uint32_t i = 0; i < n_kv; ++i) {
1102
+ const uint32_t id = ids[i];
1103
+
1104
+ if (i == id || id == n_kv) {
1105
+ continue;
1106
+ }
1107
+
1108
+ uint32_t nm = 1;
1109
+
1110
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
1111
+ nm++;
1112
+ }
1113
+
1114
+ // move keys
1115
+ {
1116
+ const int64_t os = i*k_size_row;
1117
+ const int64_t od = id*k_size_row;
1118
+
1119
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
1120
+ }
1121
+
1122
+ // move values (note: they are transposed)
1123
+ {
1124
+ const int64_t os = i;
1125
+ const int64_t od = id;
1126
+
1127
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1128
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
1129
+ }
1130
+ }
1131
+
1132
+ i += nm - 1;
1133
+ }
1134
+
1135
+ lm_ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
1136
+ lm_ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
1137
+ }
1138
+ #else
1139
+ for (uint32_t i = 0; i < ids.size(); ++i) {
1140
+ const uint32_t id = ids[i];
1141
+
1142
+ if (i == id || id == ids.size()) {
1143
+ continue;
1144
+ }
1145
+
1146
+ uint32_t nm = 1;
1147
+
1148
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
1149
+ nm++;
1150
+ }
1151
+
1152
+ for (const auto & layer : layers) {
1153
+ const uint32_t il = layer.il;
1154
+
1155
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1156
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1157
+
1158
+ lm_ggml_tensor * view_k_src = lm_ggml_view_2d(ctx, layer.k,
1159
+ n_embd_k_gqa, nm,
1160
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
1161
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa*i));
1162
+
1163
+ lm_ggml_tensor * view_k_dst = lm_ggml_view_2d(ctx, layer.k,
1164
+ n_embd_k_gqa, nm,
1165
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa),
1166
+ lm_ggml_row_size(layer.k->type, n_embd_k_gqa*id));
1167
+
1168
+ lm_ggml_tensor * view_v_src;
1169
+ lm_ggml_tensor * view_v_dst;
1170
+
1171
+ if (cparams.flash_attn) {
1172
+ // NOTE: the V cache is not transposed when using flash attention
1173
+ view_v_src = lm_ggml_view_2d(ctx, layer.v,
1174
+ n_embd_v_gqa, nm,
1175
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa),
1176
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa*i));
1177
+
1178
+ view_v_dst = lm_ggml_view_2d(ctx, layer.v,
1179
+ n_embd_v_gqa, nm,
1180
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa),
1181
+ lm_ggml_row_size(layer.v->type, n_embd_v_gqa*id));
1182
+ } else {
1183
+ view_v_src = lm_ggml_view_2d(ctx, layer.v,
1184
+ nm, n_embd_v_gqa,
1185
+ lm_ggml_row_size(layer.v->type, cells.size()),
1186
+ lm_ggml_row_size(layer.v->type, i));
1187
+
1188
+ view_v_dst = lm_ggml_view_2d(ctx, layer.v,
1189
+ nm, n_embd_v_gqa,
1190
+ lm_ggml_row_size(layer.v->type, cells.size()),
1191
+ lm_ggml_row_size(layer.v->type, id));
1192
+ }
1193
+
1194
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx, view_k_src, view_k_dst));
1195
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx, view_v_src, view_v_dst));
1196
+ }
1197
+
1198
+ i += nm - 1;
1199
+ }
1200
+
1201
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
1202
+ #endif
1203
+
1204
+ return res;
1205
+ }
1206
+
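In the graph-based branch, every move of `nm` consecutive cells becomes a pair of 2-D views (source and destination) plus a copy node, per layer and per K/V tensor; the view offsets are plain row arithmetic. The snippet below is a self-contained sketch of that offset math under the two V layouts; the struct name and its field values are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

// Byte offsets used by the defrag copies, assuming a K cache of shape
// [n_embd_k_gqa, n_cells] and, without flash attention, a transposed V cache
// of shape [n_cells, n_embd_v_gqa]. k_row_bytes stands in for lm_ggml_row_size().
struct defrag_offsets_sketch {
    size_t   k_row_bytes;  // bytes per K row (one cell)
    size_t   v_elem_bytes; // bytes per V element
    uint32_t n_cells;      // total cells in the cache

    // K (and non-transposed V): cell i starts i rows from the base.
    size_t k_offset(uint32_t i) const { return (size_t) i * k_row_bytes; }

    // transposed V: cell i is column i of every embedding row,
    // and consecutive rows are n_cells elements apart.
    size_t v_offset(uint32_t i) const { return (size_t) i * v_elem_bytes; }
    size_t v_row_stride()       const { return (size_t) n_cells * v_elem_bytes; }
};

int main() {
    const defrag_offsets_sketch o = { 4096, 2, 1024 };
    // moving cells [7, 7+3) to [2, 2+3):
    std::printf("K: copy 3 rows from byte %zu to byte %zu\n", o.k_offset(7), o.k_offset(2));
    std::printf("V (transposed): copy 3 elements per row, src %zu, row stride %zu\n",
                o.v_offset(7), o.v_row_stride());
    return 0;
}
```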
1207
+ llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
1208
+ const uint32_t n_layer = layers.size();
1209
+
1210
+ const uint32_t n_kv = cells.used_max_p1();
1211
+ const uint32_t n_used = cells.get_used();
1212
+
1213
+ assert(n_used <= n_kv);
1214
+
1215
+ //const int64_t t_start = lm_ggml_time_us();
1216
+
1217
+ // number of cells moved
1218
+ uint32_t n_moves = 0;
1219
+
1220
+        // each move requires 6*n_layer tensors (see build_graph_defrag)
1221
+ // - source view, destination view, copy operation
1222
+ // - x2 for keys and values
1223
+ //const uint32_t max_moves = max_nodes()/(6*n_layer);
1224
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
1225
+ const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
1226
+
1227
+ // determine which KV cells to move where
1228
+ defrag_info res;
1229
+ auto & ids = res.ids;
1230
+
1231
+ ids.resize(n_kv, n_kv);
1232
+
1233
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
1234
+ if (!cells.is_empty(i0)) {
1235
+ ids[i0] = i0;
1236
+
1237
+ continue;
1238
+ }
1239
+
1240
+ // found a hole - fill it with data from the end of the cache
1241
+
1242
+ uint32_t nh = 1;
1243
+
1244
+ // determine the size of the hole
1245
+ while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
1246
+ nh++;
1247
+ }
1248
+
1249
+ uint32_t nf = 0;
1250
+ uint32_t is = n_kv - 1;
1251
+
1252
+ // starting from the end, find nh non-empty cells
1253
+ for (; is > i0; --is) {
1254
+ if (cells.is_empty(is) || ids[is] != n_kv) {
1255
+ continue;
1256
+ }
1257
+
1258
+ // non-empty cell which is not yet moved
1259
+ nf++;
1260
+
1261
+ if (nf == nh) {
1262
+ break;
1263
+ }
1264
+ }
1265
+
1266
+ // this can only happen if `n_used` is not accurate, which would be a bug
1267
+ LM_GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
1268
+
1269
+ nf = 0;
1270
+
1271
+ uint32_t i1 = is;
1272
+
1273
+            // are we moving a contiguous block of memory?
1274
+ bool cont = false;
1275
+
1276
+ // should we stop searching for the next move?
1277
+ bool stop = false;
1278
+
1279
+ // go back and move the nf cells to the hole
1280
+ for (; i1 < n_kv; ++i1) {
1281
+ if (cells.is_empty(i1) || ids[i1] != n_kv) {
1282
+ if (n_moves == max_moves) {
1283
+ stop = true;
1284
+ break;
1285
+ }
1286
+
1287
+ cont = false;
1288
+ continue;
1289
+ }
1290
+
1291
+ // this cell goes to (i0 + nf)
1292
+ ids[i1] = i0 + nf;
1293
+
1294
+ if (!cont) {
1295
+ n_moves++;
1296
+ cont = true;
1297
+ }
1298
+
1299
+ nf++;
1300
+
1301
+ if (nf == nh) {
1302
+ break;
1303
+ }
1304
+ }
1305
+
1306
+ if (stop || n_moves == max_moves) {
1307
+ break;
1308
+ }
1309
+
1310
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
1311
+
1312
+ i0 += nh - 1;
1313
+ }
1314
+
1315
+ if (n_moves == 0) {
1316
+ return {};
1317
+ }
1318
+
1319
+ LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
1320
+
1321
+ LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
1322
+
1323
+ return res;
1324
+ }
1325
+
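defrag_prepare walks the used prefix of the cache and, for every hole, pulls not-yet-moved cells from the tail, recording the destination of each cell in `ids` (`ids[i] == i` means the cell stays, `ids[i] == n_kv` means it is untouched). The following is a simplified standalone sketch of that plan on a plain occupancy mask; it fills one cell per iteration and ignores the run batching and `max_moves` cap of the real planner.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of the hole-filling plan: ids[i] == i   -> cell stays in place,
//                                  ids[i] == n   -> cell is not moved,
//                                  ids[i] == dst -> cell i is copied to dst.
static std::vector<uint32_t> plan_defrag_sketch(const std::vector<bool> & used) {
    const uint32_t n      = (uint32_t) used.size();
    const uint32_t n_used = (uint32_t) std::count(used.begin(), used.end(), true);

    std::vector<uint32_t> ids(n, n);

    uint32_t is = n; // tail cursor: movable candidates live at indices < is
    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        if (used[i0]) {
            ids[i0] = i0; // already inside the packed prefix
            continue;
        }
        // hole at i0: take the last occupied, not-yet-moved cell from the tail
        while (is > i0 + 1 && (!used[is - 1] || ids[is - 1] != n)) {
            --is;
        }
        ids[--is] = i0;
    }
    return ids;
}

// Example: used = {1,0,0,1,1} yields ids = {0, 5, 5, 2, 1}:
// cell 4 fills the hole at index 1 and cell 3 fills the hole at index 2.
```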
1326
+ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
1327
+ assert(p0 >= 0 && p1 >= 0);
1328
+
1329
+ switch (swa_type) {
1330
+ case LLAMA_SWA_TYPE_NONE:
1331
+ {
1332
+ } break;
1333
+ case LLAMA_SWA_TYPE_STANDARD:
1334
+ {
1335
+ if (p1 - p0 >= (int32_t) n_swa) {
1336
+ return true;
1337
+ }
1338
+ } break;
1339
+ case LLAMA_SWA_TYPE_CHUNKED:
1340
+ {
1341
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
1342
+
1343
+ if (p0 < pos_chunk_start) {
1344
+ return true;
1345
+ }
1346
+ } break;
1347
+ }
1348
+
1349
+ return false;
1350
+ }
1351
+
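is_masked_swa encodes two window policies: a standard sliding window keeps the last n_swa positions before the query, while a chunked window keeps only the positions that fall in the chunk containing the query. Below is a standalone restatement of the same rule; the enum and function names are illustrative, not the library's.

```cpp
#include <cstdint>

enum class swa_type_sketch { none, standard, chunked };

// Returns true when the key at position p0 is outside the window of the
// query at position p1, mirroring the switch above.
static bool is_masked_swa_sketch(swa_type_sketch type, int32_t n_swa, int32_t p0, int32_t p1) {
    switch (type) {
        case swa_type_sketch::none:
            return false;
        case swa_type_sketch::standard:
            // keep only the last n_swa positions up to and including p1
            return p1 - p0 >= n_swa;
        case swa_type_sketch::chunked:
            // keep only positions inside the chunk that contains p1
            return p0 < (p1 / n_swa) * n_swa;
    }
    return false;
}

// e.g. with n_swa = 4: standard masks (p0 = 0, p1 = 4), and chunked masks
// (p0 = 3, p1 = 4) because position 4 starts the new chunk [4, 8).
```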
1352
+ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
1353
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
1354
+ uint32_t cell_count = 0;
1355
+
1356
+ // Count the number of cells with the specified seq_id
1357
+ // Find all the ranges of cells with this seq id (or all, when -1)
1358
+ uint32_t cell_range_begin = cells.size();
1359
+
1360
+ for (uint32_t i = 0; i < cells.size(); ++i) {
1361
+ if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
1362
+ ++cell_count;
1363
+ if (cell_range_begin == cells.size()) {
1364
+ cell_range_begin = i;
1365
+ }
1366
+ } else {
1367
+ if (cell_range_begin != cells.size()) {
1368
+ cell_ranges.emplace_back(cell_range_begin, i);
1369
+ cell_range_begin = cells.size();
1370
+ }
1371
+ }
1372
+ }
1373
+
1374
+ if (cell_range_begin != cells.size()) {
1375
+ cell_ranges.emplace_back(cell_range_begin, cells.size());
1376
+ }
1377
+
1378
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
1379
+ uint32_t cell_count_check = 0;
1380
+ for (const auto & range : cell_ranges) {
1381
+ cell_count_check += range.second - range.first;
1382
+ }
1383
+ LM_GGML_ASSERT(cell_count == cell_count_check);
1384
+
1385
+ io.write(&cell_count, sizeof(cell_count));
1386
+
1387
+ state_write_meta(io, cell_ranges, seq_id);
1388
+ state_write_data(io, cell_ranges);
1389
+ }
1390
+
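state_write first turns the per-cell predicate (non-empty and, optionally, belonging to seq_id) into a list of maximal [begin, end) ranges, so the tensor data can later be written in a few contiguous chunks. A generic sketch of that range collection follows; `collect_ranges_sketch` is a made-up helper, not part of the library.

```cpp
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// Collect maximal [begin, end) runs of indices for which pred(i) is true.
static std::vector<std::pair<uint32_t, uint32_t>> collect_ranges_sketch(
        uint32_t n, const std::function<bool(uint32_t)> & pred) {
    std::vector<std::pair<uint32_t, uint32_t>> ranges;

    uint32_t begin = n; // n == "no open range"
    for (uint32_t i = 0; i < n; ++i) {
        if (pred(i)) {
            if (begin == n) begin = i;      // open a new range
        } else if (begin != n) {
            ranges.emplace_back(begin, i);  // close the current range
            begin = n;
        }
    }
    if (begin != n) {
        ranges.emplace_back(begin, n);      // close a range running to the end
    }
    return ranges;
}
```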
1391
+ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1392
+ uint32_t cell_count;
1393
+ io.read_to(&cell_count, sizeof(cell_count));
1394
+
1395
+ bool res = true;
1396
+ res = res && state_read_meta(io, cell_count, seq_id);
1397
+ res = res && state_read_data(io, cell_count);
1398
+
1399
+ if (!res) {
1400
+ if (seq_id == -1) {
1401
+ clear(true);
1402
+ } else {
1403
+ seq_rm(seq_id, -1, -1);
1404
+ }
1405
+ throw std::runtime_error("failed to restore kv cache");
1406
+ }
1407
+ }
1408
+
1409
+ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
1410
+ for (const auto & range : cell_ranges) {
1411
+ for (uint32_t i = range.first; i < range.second; ++i) {
1412
+ std::vector<llama_seq_id> seq_ids;
1413
+
1414
+ for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
1415
+ if (cur == seq_id || seq_id == -1) {
1416
+ if (cells.seq_has(i, cur)) {
1417
+ seq_ids.push_back(cur);
1418
+ }
1419
+ }
1420
+ }
1421
+
1422
+ const llama_pos pos = cells.pos_get(i);
1423
+ const uint32_t n_seq_id = seq_ids.size();
1424
+
1425
+ io.write(&pos, sizeof(pos));
1426
+ io.write(&n_seq_id, sizeof(n_seq_id));
1427
+
1428
+ for (const auto & seq_id : seq_ids) {
1429
+ io.write(&seq_id, sizeof(seq_id));
1430
+ }
1431
+ }
1432
+ }
1433
+ }
1434
+
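Each cell's metadata is serialized as a fixed header followed by a variable-length list: the position, the number of sequence ids, then the ids themselves. Assuming the usual 32-bit llama_pos/llama_seq_id, here is a sketch of parsing one such record back out of a raw buffer; the struct and reader below are hypothetical.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// One cell's metadata record, as written above: pos, n_seq_id, then the seq ids.
struct cell_meta_sketch {
    int32_t              pos;
    std::vector<int32_t> seq_ids;
};

// Parse one record from a raw buffer; returns the number of bytes consumed.
static size_t read_cell_meta_sketch(const uint8_t * buf, cell_meta_sketch & out) {
    size_t off = 0;

    std::memcpy(&out.pos, buf + off, sizeof(int32_t)); off += sizeof(int32_t);

    uint32_t n_seq_id = 0;
    std::memcpy(&n_seq_id, buf + off, sizeof(uint32_t)); off += sizeof(uint32_t);

    out.seq_ids.resize(n_seq_id);
    for (uint32_t j = 0; j < n_seq_id; ++j) {
        std::memcpy(&out.seq_ids[j], buf + off, sizeof(int32_t)); off += sizeof(int32_t);
    }
    return off;
}
```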
1435
+ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
1436
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
1437
+ const uint32_t n_layer = layers.size();
1438
+
1439
+ io.write(&v_trans, sizeof(v_trans));
1440
+ io.write(&n_layer, sizeof(n_layer));
1441
+
1442
+ std::vector<uint8_t> tmp_buf;
1443
+
1444
+ // Iterate and write all the keys first, each row is a cell
1445
+ // Get whole range at a time
1446
+ for (const auto & layer : layers) {
1447
+ const uint32_t il = layer.il;
1448
+
1449
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1450
+
1451
+ // Write key type
1452
+ const int32_t k_type_i = (int32_t)layer.k->type;
1453
+ io.write(&k_type_i, sizeof(k_type_i));
1454
+
1455
+ // Write row size of key
1456
+ const uint64_t k_size_row = lm_ggml_row_size(layer.k->type, n_embd_k_gqa);
1457
+ io.write(&k_size_row, sizeof(k_size_row));
1458
+
1459
+            // Write out each range of cells, one k_size_row-sized row per cell, directly from the key tensor
1460
+ for (const auto & range : cell_ranges) {
1461
+ const size_t range_size = range.second - range.first;
1462
+ const size_t buf_size = range_size * k_size_row;
1463
+ io.write_tensor(layer.k, range.first * k_size_row, buf_size);
1464
+ }
1465
+ }
1466
+
1467
+ if (!v_trans) {
1468
+ for (const auto & layer : layers) {
1469
+ const uint32_t il = layer.il;
1470
+
1471
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1472
+
1473
+ // Write value type
1474
+ const int32_t v_type_i = (int32_t)layer.v->type;
1475
+ io.write(&v_type_i, sizeof(v_type_i));
1476
+
1477
+ // Write row size of value
1478
+ const uint64_t v_size_row = lm_ggml_row_size(layer.v->type, n_embd_v_gqa);
1479
+ io.write(&v_size_row, sizeof(v_size_row));
1480
+
1481
+            // Write out each range of cells, one v_size_row-sized row per cell, directly from the value tensor
1482
+ for (const auto & range : cell_ranges) {
1483
+ const size_t range_size = range.second - range.first;
1484
+ const size_t buf_size = range_size * v_size_row;
1485
+ io.write_tensor(layer.v, range.first * v_size_row, buf_size);
1486
+ }
1487
+ }
1488
+ } else {
1489
+        // When V is transposed, the rows are not per-cell, so we also store the element size and write the cell ranges of each row separately
1490
+ const uint32_t kv_size = cells.size();
1491
+
1492
+ for (const auto & layer : layers) {
1493
+ const uint32_t il = layer.il;
1494
+
1495
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1496
+
1497
+ // Write value type
1498
+ const int32_t v_type_i = (int32_t)layer.v->type;
1499
+ io.write(&v_type_i, sizeof(v_type_i));
1500
+
1501
+ // Write element size
1502
+ const uint32_t v_size_el = lm_ggml_type_size(layer.v->type);
1503
+ io.write(&v_size_el, sizeof(v_size_el));
1504
+
1505
+ // Write GQA embedding size
1506
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
1507
+
1508
+ // For each row, we get the element values of each cell
1509
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1510
+                // Write out each range of cells, v_size_el bytes per cell, directly from the value tensor
1511
+ for (const auto & range : cell_ranges) {
1512
+ const size_t range_size = range.second - range.first;
1513
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1514
+ const size_t buf_size = range_size * v_size_el;
1515
+ io.write_tensor(layer.v, src_offset, buf_size);
1516
+ }
1517
+ }
1518
+ }
1519
+ }
1520
+ }
1521
+
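For the transposed V layout, a cell range is not contiguous in memory: each embedding row j holds kv_size cells, so the range [first, second) within row j starts (first + j*kv_size) elements from the base and spans (second - first) elements. A tiny sketch of that offset math, using made-up helper names:

```cpp
#include <cstddef>
#include <cstdint>

// Element-wise layout: n_embd_v_gqa rows of kv_size cells each.
// Byte offset of cell 'first' in embedding row 'j'.
static size_t v_trans_offset_sketch(uint32_t first, uint32_t j, uint32_t kv_size, size_t v_size_el) {
    return ((size_t) first + (size_t) j * kv_size) * v_size_el;
}

// Bytes covered by the cells [first, second) of a single row - one contiguous run per row.
static size_t v_trans_bytes_sketch(uint32_t first, uint32_t second, size_t v_size_el) {
    return (size_t) (second - first) * v_size_el;
}
```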
1522
+ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
1523
+ if (dest_seq_id != -1) {
1524
+ // single sequence
1525
+
1526
+ seq_rm(dest_seq_id, -1, -1);
1527
+
1528
+ llama_batch_allocr balloc(hparams.n_pos_per_embd());
1529
+
1530
+ llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
1531
+
1532
+ for (uint32_t i = 0; i < cell_count; ++i) {
1533
+ llama_pos pos;
1534
+ uint32_t n_seq_id;
1535
+
1536
+ io.read_to(&pos, sizeof(pos));
1537
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1538
+
1539
+ if (n_seq_id != 1) {
1540
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1541
+ return false;
1542
+ }
1543
+
1544
+ // read the sequence id, but directly discard it - we will use dest_seq_id instead
1545
+ {
1546
+ llama_seq_id seq_id;
1547
+ io.read_to(&seq_id, sizeof(seq_id));
1548
+ }
1549
+
1550
+ ubatch.pos[i] = pos;
1551
+ ubatch.n_seq_id[i] = n_seq_id;
1552
+ ubatch.seq_id[i] = &dest_seq_id;
1553
+ }
1554
+
1555
+ const auto head_cur = find_slot(ubatch);
1556
+ if (head_cur < 0) {
1557
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1558
+ return false;
1559
+ }
1560
+
1561
+ apply_ubatch(head_cur, ubatch);
1562
+
1563
+ // keep the head at the old position because we will read the KV data into it in state_read_data()
1564
+ head = head_cur;
1565
+
1566
+ // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
1567
+ // Assume that this is one contiguous block of cells
1568
+ LM_GGML_ASSERT(head_cur + cell_count <= cells.size());
1569
+ LM_GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
1570
+ LM_GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
1571
+ LM_GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
1572
+ LM_GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
1573
+ } else {
1574
+ // whole KV cache restore
1575
+
1576
+ if (cell_count > cells.size()) {
1577
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1578
+ return false;
1579
+ }
1580
+
1581
+ clear(true);
1582
+
1583
+ for (uint32_t i = 0; i < cell_count; ++i) {
1584
+ llama_pos pos;
1585
+ uint32_t n_seq_id;
1586
+
1587
+ io.read_to(&pos, sizeof(pos));
1588
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1589
+
1590
+ cells.pos_set(i, pos);
1591
+
1592
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1593
+ llama_seq_id seq_id;
1594
+ io.read_to(&seq_id, sizeof(seq_id));
1595
+
1596
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
1597
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
1598
+ return false;
1599
+ }
1600
+
1601
+ cells.seq_add(i, seq_id);
1602
+ }
1603
+ }
1604
+
1605
+ head = 0;
1606
+ }
1607
+
1608
+ return true;
1609
+ }
1610
+
1611
+ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
1612
+ uint32_t v_trans;
1613
+ uint32_t n_layer;
1614
+
1615
+ io.read_to(&v_trans, sizeof(v_trans));
1616
+ io.read_to(&n_layer, sizeof(n_layer));
1617
+
1618
+ if (n_layer != layers.size()) {
1619
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
1620
+ return false;
1621
+ }
1622
+
1623
+ if (cell_count > cells.size()) {
1624
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
1625
+ return false;
1626
+ }
1627
+
1628
+ if (this->v_trans != (bool) v_trans) {
1629
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
1630
+ return false;
1631
+ }
1632
+
1633
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1634
+ for (const auto & layer : layers) {
1635
+ const uint32_t il = layer.il;
1636
+
1637
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1638
+
1639
+ // Read type of key
1640
+ int32_t k_type_i_ref;
1641
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1642
+ const int32_t k_type_i = (int32_t) layer.k->type;
1643
+ if (k_type_i != k_type_i_ref) {
1644
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1645
+ return false;
1646
+ }
1647
+
1648
+ // Read row size of key
1649
+ uint64_t k_size_row_ref;
1650
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1651
+ const size_t k_size_row = lm_ggml_row_size(layer.k->type, n_embd_k_gqa);
1652
+ if (k_size_row != k_size_row_ref) {
1653
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1654
+ return false;
1655
+ }
1656
+
1657
+ if (cell_count) {
1658
+ // Read and set the keys for the whole cell range
1659
+ lm_ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1660
+ }
1661
+ }
1662
+
1663
+ if (!this->v_trans) {
1664
+ for (const auto & layer : layers) {
1665
+ const uint32_t il = layer.il;
1666
+
1667
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1668
+
1669
+ // Read type of value
1670
+ int32_t v_type_i_ref;
1671
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1672
+ const int32_t v_type_i = (int32_t)layer.v->type;
1673
+ if (v_type_i != v_type_i_ref) {
1674
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1675
+ return false;
1676
+ }
1677
+
1678
+ // Read row size of value
1679
+ uint64_t v_size_row_ref;
1680
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1681
+ const size_t v_size_row = lm_ggml_row_size(layer.v->type, n_embd_v_gqa);
1682
+ if (v_size_row != v_size_row_ref) {
1683
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1684
+ return false;
1685
+ }
1686
+
1687
+ if (cell_count) {
1688
+ // Read and set the values for the whole cell range
1689
+ lm_ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1690
+ }
1691
+ }
1692
+ } else {
1693
+ // For each layer, read the values for each cell (transposed)
1694
+ for (const auto & layer : layers) {
1695
+ const uint32_t il = layer.il;
1696
+
1697
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1698
+
1699
+ // Read type of value
1700
+ int32_t v_type_i_ref;
1701
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1702
+ const int32_t v_type_i = (int32_t)layer.v->type;
1703
+ if (v_type_i != v_type_i_ref) {
1704
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1705
+ return false;
1706
+ }
1707
+
1708
+ // Read element size of value
1709
+ uint32_t v_size_el_ref;
1710
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1711
+ const size_t v_size_el = lm_ggml_type_size(layer.v->type);
1712
+ if (v_size_el != v_size_el_ref) {
1713
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1714
+ return false;
1715
+ }
1716
+
1717
+ // Read GQA embedding size
1718
+ uint32_t n_embd_v_gqa_ref;
1719
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1720
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1721
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1722
+ return false;
1723
+ }
1724
+
1725
+ if (cell_count) {
1726
+ // For each row in the transposed matrix, read the values for the whole cell range
1727
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1728
+ const size_t dst_offset = (head + j * cells.size()) * v_size_el;
1729
+ lm_ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1730
+ }
1731
+ }
1732
+ }
1733
+ }
1734
+
1735
+ return true;
1736
+ }
1737
+
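state_read_data mirrors state_write_data field for field: every type, row size and element size written earlier is read back and compared against the value recomputed from the live cache, and a mismatch aborts the restore before any tensor data is touched. A minimal sketch of that read-and-compare shape; `reader_sketch` and `expect_sketch` are stand-ins, the real code goes through llama_io_read_i.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Minimal stand-in for the stream reader used above.
struct reader_sketch {
    const uint8_t * data;
    size_t          off = 0;

    template <typename T> void read_to(T & v) {
        std::memcpy(&v, data + off, sizeof(T));
        off += sizeof(T);
    }
};

// Read a reference value from the stream and require it to match the value
// recomputed locally - the same "read, compare, bail out" pattern as above.
template <typename T>
static bool expect_sketch(reader_sketch & io, T expected, const char * what) {
    T ref{};
    io.read_to(ref);
    if (ref != expected) {
        std::fprintf(stderr, "state restore: mismatched %s\n", what);
        return false;
    }
    return true;
}
```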
1738
+ //
1739
+ // llama_kv_cache_unified_context
1740
+ //
1741
+
1742
+ llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
1743
+
1744
+ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
1745
+ llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
1746
+ n_kv = kv->get_size();
1747
+ head = 0;
1748
+ }
1749
+
1750
+ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
1751
+ llama_kv_cache_unified * kv,
1752
+ llama_context * lctx,
1753
+ bool do_shift,
1754
+ defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
1755
+ if (!do_shift && this->dinfo.empty()) {
1756
+ status = LLAMA_MEMORY_STATUS_NO_UPDATE;
1757
+ }
1758
+ }
1759
+
1760
+ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
1761
+ llama_kv_cache_unified * kv,
1762
+ llama_kv_cache_unified::ubatch_heads heads,
1763
+ std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) {
1764
+ }
1765
+
1766
+ llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
1767
+
1768
+ bool llama_kv_cache_unified_context::next() {
1769
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1770
+
1771
+ if (++i_next >= ubatches.size()) {
1772
+ return false;
1773
+ }
1774
+
1775
+ return true;
1776
+ }
1777
+
1778
+ bool llama_kv_cache_unified_context::apply() {
1779
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1780
+
1781
+ // no ubatches -> this is a KV cache update
1782
+ if (ubatches.empty()) {
1783
+ kv->update(lctx, do_shift, dinfo);
1784
+
1785
+ return true;
1786
+ }
1787
+
1788
+ kv->apply_ubatch(heads[i_next], ubatches[i_next]);
1789
+
1790
+ n_kv = kv->get_n_kv();
1791
+ head = heads[i_next];
1792
+
1793
+ return true;
1794
+ }
1795
+
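The context object behaves like a forward cursor over the prepared ubatches: the caller applies the current ubatch, runs its graph, then advances with next() until it returns false; when there are no ubatches at all, a single apply() performs the pending shift/defrag update instead. The toy stand-in below shows a plausible apply-then-advance consumption pattern; none of these types are the library's, and the real loop lives in the llama_context decode path.

```cpp
#include <cstdio>
#include <vector>

// Toy cursor with the same next()/apply() shape as the context above.
struct batch_cursor_sketch {
    std::vector<int> ubatches; // stand-in for the prepared ubatch list
    size_t i_next = 0;

    bool apply() { std::printf("process ubatch %d\n", ubatches[i_next]); return true; }
    bool next()  { return ++i_next < ubatches.size(); }
};

int main() {
    batch_cursor_sketch cur{{0, 1, 2}, 0};
    // apply the current ubatch, then advance until next() reports completion
    do {
        cur.apply();
    } while (cur.next());
    return 0;
}
```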
1796
+ llama_memory_status llama_kv_cache_unified_context::get_status() const {
1797
+ return status;
1798
+ }
1799
+
1800
+ const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
1801
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
1802
+
1803
+ return ubatches[i_next];
1804
+ }
1805
+
1806
+ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
1807
+ return n_kv;
1808
+ }
1809
+
1810
+ lm_ggml_tensor * llama_kv_cache_unified_context::get_k(lm_ggml_context * ctx, int32_t il) const {
1811
+ return kv->get_k(ctx, il, n_kv);
1812
+ }
1813
+
1814
+ lm_ggml_tensor * llama_kv_cache_unified_context::get_v(lm_ggml_context * ctx, int32_t il) const {
1815
+ return kv->get_v(ctx, il, n_kv);
1816
+ }
1817
+
1818
+ lm_ggml_tensor * llama_kv_cache_unified_context::cpy_k(lm_ggml_context * ctx, lm_ggml_tensor * k_cur, int32_t il) const {
1819
+ return kv->cpy_k(ctx, k_cur, il, head);
1820
+ }
1821
+
1822
+ lm_ggml_tensor * llama_kv_cache_unified_context::cpy_v(lm_ggml_context * ctx, lm_ggml_tensor * v_cur, int32_t il) const {
1823
+ return kv->cpy_v(ctx, v_cur, il, head);
1824
+ }
1825
+
1826
+ void llama_kv_cache_unified_context::set_input_k_shift(lm_ggml_tensor * dst) const {
1827
+ kv->set_input_k_shift(dst);
1828
+ }
1829
+
1830
+ void llama_kv_cache_unified_context::set_input_kq_mask(lm_ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1831
+ kv->set_input_kq_mask(dst, ubatch, causal_attn);
1832
+ }
1833
+
1834
+ void llama_kv_cache_unified_context::set_input_pos_bucket(lm_ggml_tensor * dst, const llama_ubatch * ubatch) const {
1835
+ kv->set_input_pos_bucket(dst, ubatch);
1836
+ }
1837
+
1838
+ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
1839
+ // the FA kernels require padding to avoid extra runtime boundary checks
1840
+ return cparams.flash_attn ? 256u : 32u;
1841
+ }
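The comment above explains the two constants: flash-attention kernels want the active KV range aligned to 256 elements, the regular path to 32. A one-line sketch of the rounding this padding value is typically used for; `pad_n_kv_sketch` is a made-up name, similar in spirit to the GGML_PAD macro applied to n_kv.

```cpp
#include <cstdint>

// Round n up to a multiple of pad - e.g. pad = 256 with flash attention, 32 otherwise.
static uint32_t pad_n_kv_sketch(uint32_t n, uint32_t pad) {
    return ((n + pad - 1) / pad) * pad;
}

// pad_n_kv_sketch(1000, 256) == 1024, pad_n_kv_sketch(990, 32) == 992.
```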