@novastera-oss/llamarn 0.0.1-alpha.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (989) hide show
  1. package/INTERFACE.md +389 -0
  2. package/LICENSE +201 -0
  3. package/README.md +235 -0
  4. package/RNLlamaCpp.podspec +69 -0
  5. package/android/CMakeLists.txt +107 -0
  6. package/android/build.gradle +111 -0
  7. package/android/generated/java/com/novastera/llamarn/NativeRNLlamaCppSpec.java +47 -0
  8. package/android/generated/jni/CMakeLists.txt +36 -0
  9. package/android/generated/jni/RNLlamaCppSpec-generated.cpp +44 -0
  10. package/android/generated/jni/RNLlamaCppSpec.h +31 -0
  11. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI-generated.cpp +42 -0
  12. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +336 -0
  13. package/android/gradle.properties +5 -0
  14. package/android/src/main/AndroidManifest.xml +3 -0
  15. package/android/src/main/AndroidManifestNew.xml +2 -0
  16. package/android/src/main/cpp/include/llama-cpp.h +30 -0
  17. package/android/src/main/cpp/include/llama.h +1440 -0
  18. package/android/src/main/java/com/novastera/llamarn/RNLlamaCppPackage.kt +21 -0
  19. package/android/src/main/jniLibs/arm64-v8a/libOpenCL.so +0 -0
  20. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  21. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  22. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  23. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  24. package/android/src/main/jniLibs/x86_64/libOpenCL.so +0 -0
  25. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  26. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  27. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  28. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  29. package/cpp/LlamaCppModel.cpp +984 -0
  30. package/cpp/LlamaCppModel.h +162 -0
  31. package/cpp/PureCppImpl.cpp +308 -0
  32. package/cpp/PureCppImpl.h +59 -0
  33. package/cpp/SystemUtils.cpp +180 -0
  34. package/cpp/SystemUtils.h +74 -0
  35. package/cpp/build-info.cpp +4 -0
  36. package/cpp/llama.cpp/AUTHORS +1106 -0
  37. package/cpp/llama.cpp/CMakeLists.txt +254 -0
  38. package/cpp/llama.cpp/CMakePresets.json +84 -0
  39. package/cpp/llama.cpp/CODEOWNERS +11 -0
  40. package/cpp/llama.cpp/CONTRIBUTING.md +127 -0
  41. package/cpp/llama.cpp/LICENSE +21 -0
  42. package/cpp/llama.cpp/Makefile +1608 -0
  43. package/cpp/llama.cpp/README.md +575 -0
  44. package/cpp/llama.cpp/SECURITY.md +68 -0
  45. package/cpp/llama.cpp/build-xcframework.sh +540 -0
  46. package/cpp/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  47. package/cpp/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  48. package/cpp/llama.cpp/cmake/build-info.cmake +64 -0
  49. package/cpp/llama.cpp/cmake/common.cmake +35 -0
  50. package/cpp/llama.cpp/cmake/git-vars.cmake +22 -0
  51. package/cpp/llama.cpp/cmake/llama-config.cmake.in +30 -0
  52. package/cpp/llama.cpp/cmake/llama.pc.in +10 -0
  53. package/cpp/llama.cpp/cmake/x64-windows-llvm.cmake +5 -0
  54. package/cpp/llama.cpp/common/CMakeLists.txt +170 -0
  55. package/cpp/llama.cpp/common/arg.cpp +3337 -0
  56. package/cpp/llama.cpp/common/arg.h +89 -0
  57. package/cpp/llama.cpp/common/base64.hpp +392 -0
  58. package/cpp/llama.cpp/common/build-info.cpp.in +4 -0
  59. package/cpp/llama.cpp/common/chat.cpp +1781 -0
  60. package/cpp/llama.cpp/common/chat.h +135 -0
  61. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +24 -0
  62. package/cpp/llama.cpp/common/common.cpp +1567 -0
  63. package/cpp/llama.cpp/common/common.h +668 -0
  64. package/cpp/llama.cpp/common/console.cpp +504 -0
  65. package/cpp/llama.cpp/common/console.h +19 -0
  66. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +1027 -0
  67. package/cpp/llama.cpp/common/json-schema-to-grammar.h +21 -0
  68. package/cpp/llama.cpp/common/json.hpp +24766 -0
  69. package/cpp/llama.cpp/common/llguidance.cpp +254 -0
  70. package/cpp/llama.cpp/common/log.cpp +393 -0
  71. package/cpp/llama.cpp/common/log.h +103 -0
  72. package/cpp/llama.cpp/common/minja/chat-template.hpp +537 -0
  73. package/cpp/llama.cpp/common/minja/minja.hpp +2941 -0
  74. package/cpp/llama.cpp/common/ngram-cache.cpp +286 -0
  75. package/cpp/llama.cpp/common/ngram-cache.h +101 -0
  76. package/cpp/llama.cpp/common/sampling.cpp +580 -0
  77. package/cpp/llama.cpp/common/sampling.h +107 -0
  78. package/cpp/llama.cpp/common/speculative.cpp +278 -0
  79. package/cpp/llama.cpp/common/speculative.h +28 -0
  80. package/cpp/llama.cpp/common/stb_image.h +7988 -0
  81. package/cpp/llama.cpp/convert_hf_to_gguf.py +6195 -0
  82. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +393 -0
  83. package/cpp/llama.cpp/convert_llama_ggml_to_gguf.py +450 -0
  84. package/cpp/llama.cpp/convert_lora_to_gguf.py +461 -0
  85. package/cpp/llama.cpp/flake.lock +58 -0
  86. package/cpp/llama.cpp/flake.nix +185 -0
  87. package/cpp/llama.cpp/ggml/CMakeLists.txt +388 -0
  88. package/cpp/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  89. package/cpp/llama.cpp/ggml/cmake/common.cmake +26 -0
  90. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +152 -0
  91. package/cpp/llama.cpp/ggml/include/ggml-alloc.h +76 -0
  92. package/cpp/llama.cpp/ggml/include/ggml-backend.h +354 -0
  93. package/cpp/llama.cpp/ggml/include/ggml-blas.h +25 -0
  94. package/cpp/llama.cpp/ggml/include/ggml-cann.h +123 -0
  95. package/cpp/llama.cpp/ggml/include/ggml-cpp.h +39 -0
  96. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +143 -0
  97. package/cpp/llama.cpp/ggml/include/ggml-cuda.h +47 -0
  98. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +50 -0
  99. package/cpp/llama.cpp/ggml/include/ggml-metal.h +66 -0
  100. package/cpp/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  101. package/cpp/llama.cpp/ggml/include/ggml-opt.h +216 -0
  102. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +33 -0
  103. package/cpp/llama.cpp/ggml/include/ggml-sycl.h +49 -0
  104. package/cpp/llama.cpp/ggml/include/ggml-vulkan.h +29 -0
  105. package/cpp/llama.cpp/ggml/include/ggml.h +2192 -0
  106. package/cpp/llama.cpp/ggml/include/gguf.h +202 -0
  107. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +345 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1042 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +255 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +586 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +2008 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +2579 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +179 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +258 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2589 -0
  119. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +1083 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +420 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +2554 -0
  122. package/cpp/llama.cpp/ggml/src/ggml-common.h +1857 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +495 -0
  124. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  131. package/cpp/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  132. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +327 -0
  134. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  135. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  136. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +512 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +13131 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +3492 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +254 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +60 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +287 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +47 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cuh +5 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/arange.cu +34 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/arange.cuh +5 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/argmax.cu +91 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/argmax.cuh +3 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +104 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +3 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +363 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/clamp.cu +45 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/clamp.cuh +5 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +828 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/concat.cu +221 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/concat.cuh +5 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +730 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +26 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/count-equal.cu +64 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +695 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +11 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/diagmask.cu +40 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +873 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1269 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +437 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +428 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +345 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +3 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +275 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cuh +15 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +3501 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/gla.cu +93 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/gla.cuh +3 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +103 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +5 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +396 -0
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +322 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +336 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +12 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +595 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +458 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +11 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-cuda/out-prod.cu +68 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +49 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cuh +5 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-cuda/pool2d.cu +94 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +189 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +27 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +456 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +7 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +31 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cuh +5 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +283 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cuh +7 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +45 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cuh +5 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +39 -0
  239. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  240. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  241. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  249. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  253. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  254. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  256. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  258. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  259. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  260. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  263. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  265. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  266. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  274. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  275. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  276. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  277. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  278. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  280. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  282. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  283. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  284. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  285. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  286. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  289. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  290. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  291. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  292. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  293. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  294. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  295. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  296. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  297. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  298. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  299. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  300. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  311. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  314. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  315. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  319. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  320. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  321. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  322. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  324. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  325. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  327. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  329. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  330. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  333. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  334. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  335. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  336. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  337. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  338. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  339. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  340. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  341. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  343. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  344. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  345. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  346. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  347. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  348. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  349. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  350. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  351. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  352. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  353. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  354. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  355. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  356. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  357. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  358. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  359. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  360. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  361. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  362. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  363. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  364. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +47 -0
  365. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  366. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +279 -0
  367. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +57 -0
  368. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +51 -0
  369. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cuh +5 -0
  370. package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  371. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +243 -0
  373. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +140 -0
  374. package/cpp/llama.cpp/ggml/src/ggml-cuda/wkv.cu +199 -0
  375. package/cpp/llama.cpp/ggml/src/ggml-cuda/wkv.cuh +7 -0
  376. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  377. package/cpp/llama.cpp/ggml/src/ggml-impl.h +601 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  379. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  380. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  381. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  382. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  383. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  384. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  385. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  386. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  387. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  388. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  389. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  392. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  394. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  395. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  397. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  398. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  401. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  402. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  403. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  404. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  405. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  406. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  407. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  408. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  409. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  410. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  411. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  412. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  413. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  414. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  416. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  417. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +618 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +5916 -0
  420. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +6891 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  423. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4966 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  425. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  426. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  427. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  428. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  429. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  430. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  431. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  432. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  433. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  434. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  435. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  436. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  437. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  438. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  441. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  442. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  443. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  445. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  447. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  448. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  449. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  450. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  451. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  452. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  453. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  455. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  456. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  460. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  461. package/cpp/llama.cpp/ggml/src/ggml-quants.c +5232 -0
  462. package/cpp/llama.cpp/ggml/src/ggml-quants.h +100 -0
  463. package/cpp/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  464. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +1813 -0
  465. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +183 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +37 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  468. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  469. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.cpp +83 -0
  470. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +493 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +197 -0
  472. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.hpp +20 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +100 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.hpp +20 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +596 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.hpp +34 -0
  477. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +753 -0
  480. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1154 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1559 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +70 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +311 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +20 -0
  488. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +4302 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  491. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +136 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +21 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  495. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1081 -0
  496. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  497. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +474 -0
  498. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +26 -0
  499. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +46 -0
  500. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +10 -0
  501. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +74 -0
  502. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +61 -0
  503. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +362 -0
  504. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.hpp +20 -0
  505. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +264 -0
  506. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +20 -0
  507. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  508. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  509. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  510. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  511. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1189 -0
  512. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  513. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  514. package/cpp/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  515. package/cpp/llama.cpp/ggml/src/ggml-threading.h +14 -0
  516. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +202 -0
  517. package/cpp/llama.cpp/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  518. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10502 -0
  519. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +22 -0
  520. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  521. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  522. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  523. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  524. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  525. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  526. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  527. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  528. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  529. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  530. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  531. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  532. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  533. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  534. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  535. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  536. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  537. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  538. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  539. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  540. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  541. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  542. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  543. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  544. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  545. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  546. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  547. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  548. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  549. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  550. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  551. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  552. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  553. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  554. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  555. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  556. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  557. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  558. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +483 -0
  559. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +383 -0
  560. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  561. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  562. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  563. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  564. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  565. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  566. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  567. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  568. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  569. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  570. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  571. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  572. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  573. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  574. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  575. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  576. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  577. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  578. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  579. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  580. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  581. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  582. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  583. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  584. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  585. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  586. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  587. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  588. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  589. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  590. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  591. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  592. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  593. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  594. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  595. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  596. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  597. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  598. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  599. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  600. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  601. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  602. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  603. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  604. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  605. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  606. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  607. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  608. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  609. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  610. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  611. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  612. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  613. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  614. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  615. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  616. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  617. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  618. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  619. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  620. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  621. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  622. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  623. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  624. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  625. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  626. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  627. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +740 -0
  628. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  629. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  630. package/cpp/llama.cpp/ggml/src/ggml.c +6499 -0
  631. package/cpp/llama.cpp/ggml/src/gguf.cpp +1330 -0
  632. package/cpp/llama.cpp/gguf-py/LICENSE +21 -0
  633. package/cpp/llama.cpp/gguf-py/README.md +99 -0
  634. package/cpp/llama.cpp/gguf-py/examples/reader.py +49 -0
  635. package/cpp/llama.cpp/gguf-py/examples/writer.py +39 -0
  636. package/cpp/llama.cpp/gguf-py/gguf/__init__.py +9 -0
  637. package/cpp/llama.cpp/gguf-py/gguf/constants.py +2296 -0
  638. package/cpp/llama.cpp/gguf-py/gguf/gguf.py +15 -0
  639. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +367 -0
  640. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +1041 -0
  641. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +223 -0
  642. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +642 -0
  643. package/cpp/llama.cpp/gguf-py/gguf/py.typed +0 -0
  644. package/cpp/llama.cpp/gguf-py/gguf/quants.py +1269 -0
  645. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +182 -0
  646. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +454 -0
  647. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +1610 -0
  648. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_hash.py +102 -0
  649. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +207 -0
  650. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_set_metadata.py +95 -0
  651. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +1172 -0
  652. package/cpp/llama.cpp/gguf-py/gguf/utility.py +264 -0
  653. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +492 -0
  654. package/cpp/llama.cpp/gguf-py/pyproject.toml +43 -0
  655. package/cpp/llama.cpp/gguf-py/tests/__init__.py +1 -0
  656. package/cpp/llama.cpp/gguf-py/tests/test_metadata.py +238 -0
  657. package/cpp/llama.cpp/gguf-py/tests/test_quants.py +238 -0
  658. package/cpp/llama.cpp/grammars/README.md +382 -0
  659. package/cpp/llama.cpp/grammars/arithmetic.gbnf +6 -0
  660. package/cpp/llama.cpp/grammars/c.gbnf +42 -0
  661. package/cpp/llama.cpp/grammars/chess.gbnf +13 -0
  662. package/cpp/llama.cpp/grammars/english.gbnf +6 -0
  663. package/cpp/llama.cpp/grammars/japanese.gbnf +7 -0
  664. package/cpp/llama.cpp/grammars/json.gbnf +25 -0
  665. package/cpp/llama.cpp/grammars/json_arr.gbnf +34 -0
  666. package/cpp/llama.cpp/grammars/list.gbnf +4 -0
  667. package/cpp/llama.cpp/include/llama-cpp.h +30 -0
  668. package/cpp/llama.cpp/include/llama.h +1440 -0
  669. package/cpp/llama.cpp/licenses/LICENSE-curl +9 -0
  670. package/cpp/llama.cpp/licenses/LICENSE-httplib +21 -0
  671. package/cpp/llama.cpp/licenses/LICENSE-jsonhpp +21 -0
  672. package/cpp/llama.cpp/licenses/LICENSE-linenoise +26 -0
  673. package/cpp/llama.cpp/media/llama0-banner.png +0 -0
  674. package/cpp/llama.cpp/media/llama0-logo.png +0 -0
  675. package/cpp/llama.cpp/media/llama1-banner.png +0 -0
  676. package/cpp/llama.cpp/media/llama1-logo.png +0 -0
  677. package/cpp/llama.cpp/media/llama1-logo.svg +34 -0
  678. package/cpp/llama.cpp/media/matmul.png +0 -0
  679. package/cpp/llama.cpp/media/matmul.svg +1238 -0
  680. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  681. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  682. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  683. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  684. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  685. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  686. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  687. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  688. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  689. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  690. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  691. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  692. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  693. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  694. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  695. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  696. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  697. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  698. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  699. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  700. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  701. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  702. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  703. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  704. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  705. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  706. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  707. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  708. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  709. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  710. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  711. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  712. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  713. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  714. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  715. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  716. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  717. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  718. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  719. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  720. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  721. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  722. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  723. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  724. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  725. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  726. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  727. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  728. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  729. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  730. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  731. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  732. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  733. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  734. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +202 -0
  735. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +156 -0
  736. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +152 -0
  737. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +152 -0
  738. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +54 -0
  739. package/cpp/llama.cpp/models/templates/README.md +22 -0
  740. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +1 -0
  741. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +1 -0
  742. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +57 -0
  743. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +4 -0
  744. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +76 -0
  745. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +58 -0
  746. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +287 -0
  747. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +109 -0
  748. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +93 -0
  749. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +109 -0
  750. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +8 -0
  751. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +87 -0
  752. package/cpp/llama.cpp/mypy.ini +7 -0
  753. package/cpp/llama.cpp/pocs/CMakeLists.txt +14 -0
  754. package/cpp/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  755. package/cpp/llama.cpp/pocs/vdot/q8dot.cpp +173 -0
  756. package/cpp/llama.cpp/pocs/vdot/vdot.cpp +311 -0
  757. package/cpp/llama.cpp/poetry.lock +1197 -0
  758. package/cpp/llama.cpp/prompts/LLM-questions.txt +49 -0
  759. package/cpp/llama.cpp/prompts/alpaca.txt +1 -0
  760. package/cpp/llama.cpp/prompts/assistant.txt +31 -0
  761. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  762. package/cpp/llama.cpp/prompts/chat-with-bob.txt +7 -0
  763. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  764. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  765. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  766. package/cpp/llama.cpp/prompts/chat.txt +28 -0
  767. package/cpp/llama.cpp/prompts/dan-modified.txt +1 -0
  768. package/cpp/llama.cpp/prompts/dan.txt +1 -0
  769. package/cpp/llama.cpp/prompts/mnemonics.txt +93 -0
  770. package/cpp/llama.cpp/prompts/parallel-questions.txt +43 -0
  771. package/cpp/llama.cpp/prompts/reason-act.txt +18 -0
  772. package/cpp/llama.cpp/pyproject.toml +45 -0
  773. package/cpp/llama.cpp/pyrightconfig.json +22 -0
  774. package/cpp/llama.cpp/requirements/requirements-all.txt +15 -0
  775. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  776. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  777. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  778. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +5 -0
  779. package/cpp/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  780. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  781. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  782. package/cpp/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  783. package/cpp/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  784. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  785. package/cpp/llama.cpp/requirements.txt +13 -0
  786. package/cpp/llama.cpp/src/CMakeLists.txt +45 -0
  787. package/cpp/llama.cpp/src/llama-adapter.cpp +388 -0
  788. package/cpp/llama.cpp/src/llama-adapter.h +76 -0
  789. package/cpp/llama.cpp/src/llama-arch.cpp +1743 -0
  790. package/cpp/llama.cpp/src/llama-arch.h +437 -0
  791. package/cpp/llama.cpp/src/llama-batch.cpp +372 -0
  792. package/cpp/llama.cpp/src/llama-batch.h +89 -0
  793. package/cpp/llama.cpp/src/llama-chat.cpp +663 -0
  794. package/cpp/llama.cpp/src/llama-chat.h +58 -0
  795. package/cpp/llama.cpp/src/llama-context.cpp +2459 -0
  796. package/cpp/llama.cpp/src/llama-context.h +246 -0
  797. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -0
  798. package/cpp/llama.cpp/src/llama-cparams.h +39 -0
  799. package/cpp/llama.cpp/src/llama-grammar.cpp +1219 -0
  800. package/cpp/llama.cpp/src/llama-grammar.h +173 -0
  801. package/cpp/llama.cpp/src/llama-graph.cpp +1713 -0
  802. package/cpp/llama.cpp/src/llama-graph.h +595 -0
  803. package/cpp/llama.cpp/src/llama-hparams.cpp +79 -0
  804. package/cpp/llama.cpp/src/llama-hparams.h +161 -0
  805. package/cpp/llama.cpp/src/llama-impl.cpp +167 -0
  806. package/cpp/llama.cpp/src/llama-impl.h +61 -0
  807. package/cpp/llama.cpp/src/llama-io.cpp +15 -0
  808. package/cpp/llama.cpp/src/llama-io.h +35 -0
  809. package/cpp/llama.cpp/src/llama-kv-cache.cpp +2486 -0
  810. package/cpp/llama.cpp/src/llama-kv-cache.h +405 -0
  811. package/cpp/llama.cpp/src/llama-memory.cpp +1 -0
  812. package/cpp/llama.cpp/src/llama-memory.h +31 -0
  813. package/cpp/llama.cpp/src/llama-mmap.cpp +600 -0
  814. package/cpp/llama.cpp/src/llama-mmap.h +68 -0
  815. package/cpp/llama.cpp/src/llama-model-loader.cpp +1133 -0
  816. package/cpp/llama.cpp/src/llama-model-loader.h +169 -0
  817. package/cpp/llama.cpp/src/llama-model.cpp +13453 -0
  818. package/cpp/llama.cpp/src/llama-model.h +420 -0
  819. package/cpp/llama.cpp/src/llama-quant.cpp +964 -0
  820. package/cpp/llama.cpp/src/llama-quant.h +1 -0
  821. package/cpp/llama.cpp/src/llama-sampling.cpp +2575 -0
  822. package/cpp/llama.cpp/src/llama-sampling.h +32 -0
  823. package/cpp/llama.cpp/src/llama-vocab.cpp +3313 -0
  824. package/cpp/llama.cpp/src/llama-vocab.h +125 -0
  825. package/cpp/llama.cpp/src/llama.cpp +340 -0
  826. package/cpp/llama.cpp/src/unicode-data.cpp +7034 -0
  827. package/cpp/llama.cpp/src/unicode-data.h +20 -0
  828. package/cpp/llama.cpp/src/unicode.cpp +849 -0
  829. package/cpp/llama.cpp/src/unicode.h +66 -0
  830. package/cpp/rn-completion.cpp +431 -0
  831. package/cpp/rn-llama.hpp +60 -0
  832. package/cpp/rn-utils.hpp +331 -0
  833. package/ios/OnLoad.mm +22 -0
  834. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec-generated.mm +64 -0
  835. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +251 -0
  836. package/ios/generated/RNLlamaCppSpecJSI-generated.cpp +42 -0
  837. package/ios/generated/RNLlamaCppSpecJSI.h +336 -0
  838. package/ios/include/chat.h +135 -0
  839. package/ios/include/common/base64.hpp +392 -0
  840. package/ios/include/common/json.hpp +24766 -0
  841. package/ios/include/common/minja/chat-template.hpp +537 -0
  842. package/ios/include/common/minja/minja.hpp +2941 -0
  843. package/ios/include/common.h +668 -0
  844. package/ios/include/json-schema-to-grammar.h +21 -0
  845. package/ios/include/llama-cpp.h +30 -0
  846. package/ios/include/llama.h +1440 -0
  847. package/ios/include/log.h +103 -0
  848. package/ios/include/ngram-cache.h +101 -0
  849. package/ios/include/sampling.h +107 -0
  850. package/ios/include/speculative.h +28 -0
  851. package/ios/libs/llama.xcframework/Info.plist +135 -0
  852. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  853. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  854. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4492 -0
  855. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-alloc.h +76 -0
  856. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +354 -0
  857. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-blas.h +25 -0
  858. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +143 -0
  859. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +66 -0
  860. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +2192 -0
  861. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/gguf.h +202 -0
  862. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +1440 -0
  863. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Info.plist +36 -0
  864. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Modules/module.modulemap +17 -0
  865. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  866. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  867. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4513 -0
  869. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3440 -0
  870. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +76 -0
  871. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +354 -0
  872. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +25 -0
  873. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +143 -0
  874. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +66 -0
  875. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +2192 -0
  876. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +202 -0
  877. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +1440 -0
  878. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Info.plist +36 -0
  879. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +17 -0
  880. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4513 -0
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3442 -0
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +76 -0
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +354 -0
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +25 -0
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +143 -0
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +66 -0
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +2192 -0
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +202 -0
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +1440 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +17 -0
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +32 -0
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +76 -0
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +354 -0
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +25 -0
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +143 -0
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +66 -0
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +2192 -0
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +202 -0
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +1440 -0
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +17 -0
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +32 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  906. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +76 -0
  907. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +354 -0
  908. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +25 -0
  909. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +143 -0
  910. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +66 -0
  911. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +2192 -0
  912. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +202 -0
  913. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +1440 -0
  914. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +17 -0
  915. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +32 -0
  916. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  917. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  918. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  919. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  920. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4492 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +76 -0
  922. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +354 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +25 -0
  924. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +143 -0
  925. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +66 -0
  926. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +2192 -0
  927. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +202 -0
  928. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +1440 -0
  929. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +35 -0
  930. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +17 -0
  931. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4513 -0
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3440 -0
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +76 -0
  937. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +354 -0
  938. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +25 -0
  939. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +143 -0
  940. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +66 -0
  941. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +2192 -0
  942. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +202 -0
  943. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +1440 -0
  944. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +35 -0
  945. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +17 -0
  946. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  947. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  948. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  949. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4528 -0
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +76 -0
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +354 -0
  952. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +25 -0
  953. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +143 -0
  954. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +66 -0
  955. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +2192 -0
  956. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +202 -0
  957. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +1440 -0
  958. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +32 -0
  959. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +17 -0
  960. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +20 -0
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4549 -0
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3470 -0
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +76 -0
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +354 -0
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +25 -0
  968. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +143 -0
  969. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +66 -0
  970. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +2192 -0
  971. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +202 -0
  972. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +1440 -0
  973. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +32 -0
  974. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +17 -0
  975. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  976. package/lib/module/NativeRNLlamaCpp.js +35 -0
  977. package/lib/module/NativeRNLlamaCpp.js.map +1 -0
  978. package/lib/module/index.js +20 -0
  979. package/lib/module/index.js.map +1 -0
  980. package/lib/module/package.json +1 -0
  981. package/lib/typescript/package.json +1 -0
  982. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +222 -0
  983. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -0
  984. package/lib/typescript/src/index.d.ts +5 -0
  985. package/lib/typescript/src/index.d.ts.map +1 -0
  986. package/package.json +161 -0
  987. package/react-native.config.js +15 -0
  988. package/src/NativeRNLlamaCpp.ts +282 -0
  989. package/src/index.tsx +54 -0
@@ -0,0 +1,2486 @@
1
+ #include "llama-kv-cache.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+ #include "llama-context.h"
8
+
9
+ #include <algorithm>
10
+ #include <cassert>
11
+ #include <cmath>
12
+ #include <limits>
13
+ #include <map>
14
+ #include <stdexcept>
15
+
16
+ //
17
+ // llama_kv_cache_unified
18
+ //
19
+
20
+ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
21
+ // the FA kernels require padding to avoid extra runtime boundary checks
22
+ return cparams.flash_attn ? 256u : 32u;
23
+ }
24
+
25
+ llama_kv_cache_unified::llama_kv_cache_unified(
26
+ const llama_model & model,
27
+ ggml_type type_k,
28
+ ggml_type type_v,
29
+ bool v_trans,
30
+ bool offload,
31
+ uint32_t kv_size,
32
+ uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
33
+ const int32_t n_layer = hparams.n_layer;
34
+
35
+ has_shift = false;
36
+ can_shift = true;
37
+
38
+ LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
39
+ __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
40
+
41
+ GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
42
+
43
+ head = 0;
44
+ size = kv_size;
45
+ used = 0;
46
+
47
+ this->type_k = type_k;
48
+ this->type_v = type_v;
49
+
50
+ cells.clear();
51
+ cells.resize(kv_size);
52
+
53
+ // create a context for each buffer type
54
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
55
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
56
+ auto it = ctx_map.find(buft);
57
+ if (it == ctx_map.end()) {
58
+ ggml_init_params params = {
59
+ /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
60
+ /*.mem_buffer =*/ NULL,
61
+ /*.no_alloc =*/ true,
62
+ };
63
+
64
+ ggml_context * ctx = ggml_init(params);
65
+ if (!ctx) {
66
+ return nullptr;
67
+ }
68
+
69
+ ctx_map[buft] = ctx;
70
+ ctxs.emplace_back(ctx);
71
+
72
+ return ctx;
73
+ }
74
+
75
+ return it->second;
76
+ };
77
+
78
+ k_l.reserve(n_layer);
79
+ v_l.reserve(n_layer);
80
+
81
+ for (int i = 0; i < n_layer; i++) {
82
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
83
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
84
+
85
+ const char * dev_name = "CPU";
86
+
87
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
88
+
89
+ if (offload) {
90
+ auto * dev = model.dev_layer(i);
91
+ buft = ggml_backend_dev_buffer_type(dev);
92
+
93
+ dev_name = ggml_backend_dev_name(dev);
94
+ }
95
+
96
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);
97
+
98
+ ggml_context * ctx = ctx_for_buft(buft);
99
+ if (!ctx) {
100
+ throw std::runtime_error("failed to create ggml context for kv cache");
101
+ }
102
+
103
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
104
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
105
+ ggml_format_name(k, "cache_k_l%d", i);
106
+ ggml_format_name(v, "cache_v_l%d", i);
107
+ k_l.push_back(k);
108
+ v_l.push_back(v);
109
+ }
110
+
111
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
112
+ for (auto it : ctx_map) {
113
+ auto * buft = it.first;
114
+ auto * ctx = it.second;
115
+
116
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
117
+ if (!buf) {
118
+ throw std::runtime_error("failed to allocate buffer for kv cache");
119
+ }
120
+ ggml_backend_buffer_clear(buf, 0);
121
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
122
+ bufs.emplace_back(buf);
123
+ }
124
+
125
+ {
126
+ const size_t memory_size_k = size_k_bytes();
127
+ const size_t memory_size_v = size_v_bytes();
128
+
129
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
130
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
131
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
132
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
133
+ }
134
+ }
135
+
136
+ void llama_kv_cache_unified::clear() {
137
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
138
+ cells[i].pos = -1;
139
+ cells[i].seq_id.clear();
140
+ }
141
+ head = 0;
142
+ used = 0;
143
+
144
+ for (auto & buf : bufs) {
145
+ ggml_backend_buffer_clear(buf.get(), 0);
146
+ }
147
+ }
148
+
149
+ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
150
+ uint32_t new_head = size;
151
+
152
+ if (p0 < 0) {
153
+ p0 = 0;
154
+ }
155
+
156
+ if (p1 < 0) {
157
+ p1 = std::numeric_limits<llama_pos>::max();
158
+ }
159
+
160
+ for (uint32_t i = 0; i < size; ++i) {
161
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
162
+ if (seq_id < 0) {
163
+ cells[i].seq_id.clear();
164
+ } else if (cells[i].has_seq_id(seq_id)) {
165
+ cells[i].seq_id.erase(seq_id);
166
+ } else {
167
+ continue;
168
+ }
169
+ if (cells[i].is_empty()) {
170
+ // keep count of the number of used cells
171
+ if (cells[i].pos >= 0) {
172
+ used--;
173
+ }
174
+
175
+ cells[i].pos = -1;
176
+
177
+ if (new_head == size) {
178
+ new_head = i;
179
+ }
180
+ }
181
+ }
182
+ }
183
+
184
+ // If we freed up a slot, set head to it so searching can start there.
185
+ if (new_head != size && new_head < head) {
186
+ head = new_head;
187
+ }
188
+
189
+ return true;
190
+ }
191
+
192
+ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
193
+ if (seq_id_src == seq_id_dst) {
194
+ return;
195
+ }
196
+
197
+ if (p0 < 0) {
198
+ p0 = 0;
199
+ }
200
+
201
+ if (p1 < 0) {
202
+ p1 = std::numeric_limits<llama_pos>::max();
203
+ }
204
+
205
+ // otherwise, this is the KV of a Transformer-like model
206
+ head = 0;
207
+
208
+ for (uint32_t i = 0; i < size; ++i) {
209
+ if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
210
+ cells[i].seq_id.insert(seq_id_dst);
211
+ }
212
+ }
213
+ }
214
+
215
+ void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
216
+ uint32_t new_head = size;
217
+
218
+ for (uint32_t i = 0; i < size; ++i) {
219
+ if (!cells[i].has_seq_id(seq_id)) {
220
+ if (cells[i].pos >= 0) {
221
+ used--;
222
+ }
223
+
224
+ cells[i].pos = -1;
225
+ cells[i].seq_id.clear();
226
+
227
+ if (new_head == size){
228
+ new_head = i;
229
+ }
230
+ } else {
231
+ cells[i].seq_id.clear();
232
+ cells[i].seq_id.insert(seq_id);
233
+ }
234
+ }
235
+
236
+ // If we freed up a slot, set head to it so searching can start there.
237
+ if (new_head != size && new_head < head) {
238
+ head = new_head;
239
+ }
240
+ }
241
+
242
+ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
243
+ if (delta == 0) {
244
+ return;
245
+ }
246
+
247
+ uint32_t new_head = size;
248
+
249
+ if (p0 < 0) {
250
+ p0 = 0;
251
+ }
252
+
253
+ if (p1 < 0) {
254
+ p1 = std::numeric_limits<llama_pos>::max();
255
+ }
256
+
257
+ // If there is no range then return early to avoid looping over the
258
+ if (p0 == p1) {
259
+ return;
260
+ }
261
+
262
+ for (uint32_t i = 0; i < size; ++i) {
263
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
264
+ has_shift = true;
265
+ cells[i].pos += delta;
266
+ cells[i].delta += delta;
267
+
268
+ if (cells[i].pos < 0) {
269
+ if (!cells[i].is_empty()) {
270
+ used--;
271
+ }
272
+ cells[i].pos = -1;
273
+ cells[i].seq_id.clear();
274
+ if (new_head == size) {
275
+ new_head = i;
276
+ }
277
+ }
278
+ }
279
+ }
280
+
281
+ // If we freed up a slot, set head to it so searching can start there.
282
+ // Otherwise we just start the next search from the beginning.
283
+ head = new_head != size ? new_head : 0;
284
+ }
285
+
286
+ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
287
+ if (d == 1) {
288
+ return;
289
+ }
290
+
291
+ if (p0 < 0) {
292
+ p0 = 0;
293
+ }
294
+
295
+ if (p1 < 0) {
296
+ p1 = std::numeric_limits<llama_pos>::max();
297
+ }
298
+
299
+ // If there is no range then return early to avoid looping over the cache.
300
+ if (p0 == p1) {
301
+ return;
302
+ }
303
+
304
+ for (uint32_t i = 0; i < size; ++i) {
305
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
306
+ has_shift = true;
307
+
308
+ {
309
+ llama_pos p_old = cells[i].pos;
310
+ cells[i].pos /= d;
311
+ cells[i].delta += cells[i].pos - p_old;
312
+ }
313
+ }
314
+ }
315
+ }
316
+
317
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
318
+ llama_pos result = 0;
319
+
320
+ for (uint32_t i = 0; i < size; ++i) {
321
+ if (cells[i].has_seq_id(seq_id)) {
322
+ result = std::max(result, cells[i].pos);
323
+ }
324
+ }
325
+
326
+ return result;
327
+ }
328
+
329
+ void llama_kv_cache_unified::restore() {
330
+ if (pending.ranges.empty()) {
331
+ return;
332
+ }
333
+
334
+ uint32_t new_head = size;
335
+
336
+ for (auto & range : pending.ranges) {
337
+ for (uint32_t i = range.c0; i < range.c1; ++i) {
338
+ cells[i].seq_id.clear();
339
+
340
+ // keep count of the number of used cells
341
+ if (cells[i].pos >= 0) {
342
+ used--;
343
+ }
344
+
345
+ cells[i].pos = -1;
346
+ }
347
+
348
+ new_head = std::min(new_head, range.c0);
349
+ }
350
+
351
+ if (new_head != size && new_head < head) {
352
+ head = new_head;
353
+ }
354
+ }
355
+
356
+ void llama_kv_cache_unified::commit() {
357
+ if (pending.ranges.empty()) {
358
+ LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
359
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
360
+ return;
361
+ }
362
+
363
+ pending.ranges.clear();
364
+ }
365
+
366
+ bool llama_kv_cache_unified::update(llama_context & lctx) {
367
+ bool need_reserve = false;
368
+
369
+ auto * sched = lctx.get_sched();
370
+
371
+ if (has_shift) {
372
+ if (!get_can_shift()) {
373
+ GGML_ABORT("The current KV cache / model configuration does not support K-shift");
374
+ }
375
+
376
+ LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
377
+
378
+ // apply K-shift if needed
379
+ if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
380
+ ggml_backend_sched_reset(sched);
381
+
382
+ auto * gf = lctx.graph_init();
383
+
384
+ auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
385
+
386
+ ggml_backend_sched_alloc_graph(sched, gf);
387
+
388
+ res->set_inputs(nullptr);
389
+
390
+ lctx.graph_compute(gf, false);
391
+
392
+ need_reserve = true;
393
+ }
394
+
395
+ {
396
+ has_shift = false;
397
+
398
+ for (uint32_t i = 0; i < size; ++i) {
399
+ cells[i].delta = 0;
400
+ }
401
+ }
402
+ }
403
+
404
+ if (do_defrag) {
405
+ LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
406
+
407
+ if (defrag_prepare(lctx.graph_max_nodes())) {
408
+ ggml_backend_sched_reset(sched);
409
+
410
+ auto * gf = lctx.graph_init();
411
+
412
+ auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
413
+
414
+ ggml_backend_sched_alloc_graph(sched, gf);
415
+
416
+ res->set_inputs(nullptr);
417
+
418
+ lctx.graph_compute(gf, false);
419
+
420
+ need_reserve = true;
421
+ }
422
+
423
+ do_defrag = false;
424
+ }
425
+
426
+ return need_reserve;
427
+ }
428
+
429
+ void llama_kv_cache_unified::defrag_sched(float thold) {
430
+ // - do not defrag small contexts (i.e. < 2048 tokens)
431
+ // - count the padding towards the number of used tokens
432
+ const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
433
+
434
+ // queue defragmentation for next llama_kv_cache_update
435
+ if (fragmentation > thold) {
436
+ LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
437
+
438
+ do_defrag = true;
439
+ }
440
+ }
441
+
442
+ void llama_kv_cache_unified::set_full() {
443
+ n = size;
444
+ }
445
+
446
+ llama_sbatch llama_kv_cache_unified::sbatch_init(
447
+ const llama_batch & batch,
448
+ bool logits_all) {
449
+ return llama_sbatch(batch, hparams.n_embd, true, logits_all);
450
+ }
451
+
452
+ llama_ubatch llama_kv_cache_unified::ubatch_next(
453
+ llama_sbatch & sbatch,
454
+ uint32_t n_ubatch,
455
+ bool embd_pooled) const {
456
+ GGML_UNUSED(embd_pooled);
457
+ return sbatch.split_simple(n_ubatch);
458
+ }
459
+
460
+ bool llama_kv_cache_unified::find_slot(
461
+ const llama_ubatch & ubatch) {
462
+ const uint32_t n_tokens = ubatch.n_tokens;
463
+ const uint32_t n_seqs = ubatch.n_seqs;
464
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
465
+
466
+ // if we have enough unused cells before the current head ->
467
+ // better to start searching from the beginning of the cache, hoping to fill it
468
+ if (head > used + 2*ubatch.n_tokens) {
469
+ head = 0;
470
+ }
471
+
472
+ // otherwise, one cell per token.
473
+
474
+ if (n_tokens > size) {
475
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
476
+ return false;
477
+ }
478
+
479
+ uint32_t n_tested = 0;
480
+
481
+ while (true) {
482
+ if (head + n_tokens > size) {
483
+ n_tested += size - head;
484
+ head = 0;
485
+ continue;
486
+ }
487
+
488
+ bool found = true;
489
+ for (uint32_t i = 0; i < n_tokens; i++) {
490
+ if (cells[head + i].pos >= 0) {
491
+ found = false;
492
+ head += i + 1;
493
+ n_tested += i + 1;
494
+ break;
495
+ }
496
+ }
497
+
498
+ if (found) {
499
+ break;
500
+ }
501
+
502
+ if (n_tested >= size) {
503
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
504
+ return false;
505
+ }
506
+ }
507
+
508
+ for (uint32_t s = 0; s < n_seqs; s++) {
509
+ for (uint32_t i = 0; i < n_seq_tokens; ++i) {
510
+ uint32_t k = s*n_seq_tokens + i;
511
+ cells[head + k].pos = ubatch.pos[k];
512
+
513
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
514
+ cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
515
+ }
516
+ }
517
+ }
518
+
519
+ used += n_tokens;
520
+
521
+ pending.ranges.push_back({head, head + n_tokens});
522
+
523
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
524
+ // after enough generations, the benefit from this heuristic disappears
525
+ // if we start defragmenting the cache, the benefit from this will be more important
526
+ n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
527
+
528
+ //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);
529
+
530
+ return true;
531
+ }
532
+
533
+ int32_t llama_kv_cache_unified::get_n_tokens() const {
534
+ int32_t result = 0;
535
+
536
+ for (uint32_t i = 0; i < size; i++) {
537
+ result += cells[i].seq_id.size();
538
+ }
539
+
540
+ return result;
541
+ }
542
+
543
+ int32_t llama_kv_cache_unified::get_used_cells() const {
544
+ return used;
545
+ }
546
+
547
+ bool llama_kv_cache_unified::get_can_shift() const {
548
+ return can_shift;
549
+ }
550
+
551
+ llama_pos llama_kv_cache_unified::get_pos_max() const {
552
+ llama_pos pos_max = -1;
553
+ for (const auto & cell : cells) {
554
+ pos_max = std::max(pos_max, cell.pos);
555
+ }
556
+
557
+ return pos_max;
558
+ }
559
+
560
+ size_t llama_kv_cache_unified::total_size() const {
561
+ size_t size = 0;
562
+ for (const auto & buf : bufs) {
563
+ size += ggml_backend_buffer_get_size(buf.get());
564
+ }
565
+
566
+ return size;
567
+ }
568
+
569
+ size_t llama_kv_cache_unified::size_k_bytes() const {
570
+ size_t size_k_bytes = 0;
571
+
572
+ for (const auto & k : k_l) {
573
+ size_k_bytes += ggml_nbytes(k);
574
+ }
575
+
576
+ return size_k_bytes;
577
+ }
578
+
579
+ size_t llama_kv_cache_unified::size_v_bytes() const {
580
+ size_t size_v_bytes = 0;
581
+
582
+ for (const auto & v : v_l) {
583
+ size_v_bytes += ggml_nbytes(v);
584
+ }
585
+
586
+ return size_v_bytes;
587
+ }
588
+
589
+ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
590
+ const llama_cparams & cparams,
591
+ ggml_context * ctx,
592
+ ggml_tensor * cur,
593
+ ggml_tensor * shift,
594
+ ggml_tensor * factors,
595
+ float freq_base,
596
+ float freq_scale) const {
597
+ const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
598
+
599
+ const auto & yarn_ext_factor = cparams.yarn_ext_factor;
600
+ const auto & yarn_beta_fast = cparams.yarn_beta_fast;
601
+ const auto & yarn_beta_slow = cparams.yarn_beta_slow;
602
+
603
+ const auto & n_rot = hparams.n_rot;
604
+ const auto & rope_type = hparams.rope_type;
605
+
606
+ // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
607
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
608
+ const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
609
+
610
+ ggml_tensor * tmp;
611
+
612
+ if (ggml_is_quantized(cur->type)) {
613
+ // dequantize to f32 -> RoPE -> quantize back
614
+ tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
615
+
616
+ tmp = ggml_rope_ext(ctx, tmp,
617
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
618
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
619
+
620
+ tmp = ggml_cpy(ctx, tmp, cur);
621
+ } else {
622
+ // we rotate only the first n_rot dimensions
623
+ tmp = ggml_rope_ext_inplace(ctx, cur,
624
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
625
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
626
+ }
627
+
628
+ return tmp;
629
+ }
630
+
631
+ class llm_graph_input_k_shift : public llm_graph_input_i {
632
+ public:
633
+ llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
634
+ virtual ~llm_graph_input_k_shift() = default;
635
+
636
+ void set_input(const llama_ubatch * ubatch) override;
637
+
638
+ ggml_tensor * k_shift; // I32 [kv_size]
639
+
640
+ const llama_kv_cache_unified * kv_self;
641
+ };
642
+
643
+ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
644
+ GGML_UNUSED(ubatch);
645
+
646
+ if (k_shift) {
647
+ assert(ggml_backend_buffer_is_host(k_shift->buffer));
648
+
649
+ int32_t * data = (int32_t *) k_shift->data;
650
+
651
+ for (uint32_t i = 0; i < kv_self->size; ++i) {
652
+ data[i] = kv_self->cells[i].delta;
653
+ }
654
+ }
655
+ }
656
+
657
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
658
+ const llama_cparams & cparams,
659
+ ggml_context * ctx,
660
+ ggml_cgraph * gf) const {
661
+ auto res = std::make_unique<llm_graph_result>();
662
+
663
+ const auto & n_layer = hparams.n_layer;
664
+
665
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
666
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
667
+
668
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
669
+
670
+ //GGML_ASSERT(kv_self->size == n_ctx);
671
+
672
+ auto inp = std::make_unique<llm_graph_input_k_shift>(this);
673
+
674
+ inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
675
+ ggml_set_input(inp->k_shift);
676
+
677
+ for (uint32_t il = 0; il < n_layer; ++il) {
678
+ const int64_t n_head_kv = hparams.n_head_kv(il);
679
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
680
+
681
+ const bool is_swa = hparams.is_swa(il);
682
+
683
+ // note: the swa rope params could become part of the cparams in the future
684
+ // if we decide to make them configurable, like the non-sliding ones
685
+ const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
686
+ const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
687
+
688
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
689
+
690
+ ggml_tensor * k =
691
+ ggml_view_3d(ctx, k_l[il],
692
+ n_embd_head_k, n_head_kv, size,
693
+ ggml_row_size(k_l[il]->type, n_embd_head_k),
694
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa),
695
+ 0);
696
+
697
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
698
+
699
+ ggml_build_forward_expand(gf, cur);
700
+ }
701
+
702
+ res->add_input(std::move(inp));
703
+
704
+ return res;
705
+ }
706
+
707
+ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
708
+ const llama_cparams & cparams,
709
+ ggml_context * ctx,
710
+ ggml_cgraph * gf) const {
711
+ auto res = std::make_unique<llm_graph_result>();
712
+
713
+ const auto & ids = defrag_info.ids;
714
+
715
+ #if 0
716
+ // CPU defrag
717
+ //
718
+ // TODO: optimizations are possible:
719
+ // - multiple threads
720
+ // - avoid copying to the host memory when already there
721
+ //
722
+ // likely not worth the effort, as we have ggml_graph based defrag
723
+ //
724
+
725
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
726
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
727
+
728
+ const uint32_t kv_size = size;
729
+
730
+ std::vector<uint8_t> buf_k;
731
+ std::vector<uint8_t> buf_v;
732
+
733
+ for (uint32_t il = 0; il < n_layer; ++il) {
734
+ const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
735
+ const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
736
+
737
+ const size_t v_size_el = ggml_type_size(v_l[il]->type);
738
+ const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
739
+
740
+ buf_k.resize(k_size);
741
+ buf_v.resize(v_size);
742
+
743
+ ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
744
+ ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
745
+
746
+ // batch move [i, i+nm) to [id, id+nm)
747
+ // note: cells can move only to a lower index
748
+ for (uint32_t i = 0; i < n_kv; ++i) {
749
+ const uint32_t id = ids[i];
750
+
751
+ if (i == id || id == n_kv) {
752
+ continue;
753
+ }
754
+
755
+ uint32_t nm = 1;
756
+
757
+ while (i + nm < n_kv && ids[i + nm] == id + nm) {
758
+ nm++;
759
+ }
760
+
761
+ // move keys
762
+ {
763
+ const int64_t os = i*k_size_row;
764
+ const int64_t od = id*k_size_row;
765
+
766
+ memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
767
+ }
768
+
769
+ // move values (note: they are transposed)
770
+ {
771
+ const int64_t os = i;
772
+ const int64_t od = id;
773
+
774
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
775
+ memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
776
+ }
777
+ }
778
+
779
+ i += nm - 1;
780
+ }
781
+
782
+ ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
783
+ ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
784
+ }
785
+ #else
786
+ for (uint32_t i = 0; i < ids.size(); ++i) {
787
+ const uint32_t id = ids[i];
788
+
789
+ if (i == id || id == ids.size()) {
790
+ continue;
791
+ }
792
+
793
+ uint32_t nm = 1;
794
+
795
+ while (i + nm < ids.size() && ids[i + nm] == id + nm) {
796
+ nm++;
797
+ }
798
+
799
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
800
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
801
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
802
+
803
+ ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
804
+ n_embd_k_gqa, nm,
805
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa),
806
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
807
+
808
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
809
+ n_embd_k_gqa, nm,
810
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa),
811
+ ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
812
+
813
+ ggml_tensor * view_v_src;
814
+ ggml_tensor * view_v_dst;
815
+
816
+ if (cparams.flash_attn) {
817
+ // NOTE: the V cache is not transposed when using flash attention
818
+ view_v_src = ggml_view_2d(ctx, v_l[il],
819
+ n_embd_v_gqa, nm,
820
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa),
821
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
822
+
823
+ view_v_dst = ggml_view_2d(ctx, v_l[il],
824
+ n_embd_v_gqa, nm,
825
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa),
826
+ ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
827
+ } else {
828
+ view_v_src = ggml_view_2d(ctx, v_l[il],
829
+ nm, n_embd_v_gqa,
830
+ ggml_row_size(v_l[il]->type, size),
831
+ ggml_row_size(v_l[il]->type, i));
832
+
833
+ view_v_dst = ggml_view_2d(ctx, v_l[il],
834
+ nm, n_embd_v_gqa,
835
+ ggml_row_size(v_l[il]->type, size),
836
+ ggml_row_size(v_l[il]->type, id));
837
+ }
838
+
839
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
840
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
841
+ }
842
+
843
+ i += nm - 1;
844
+ }
845
+
846
+ //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
847
+ #endif
848
+
849
+ return res;
850
+ }
851
+
852
+ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
853
+ const uint32_t n_layer = hparams.n_layer;
854
+
855
+ const uint32_t n_kv = cell_max();
856
+ const uint32_t n_used = used;
857
+
858
+ assert(n_used <= n_kv);
859
+
860
+ //const int64_t t_start = ggml_time_us();
861
+
862
+ // number of cells moved
863
+ uint32_t n_moves = 0;
864
+
865
+ // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
866
+ // - source view, destination view, copy operation
867
+ // - x2 for keys and values
868
+ //const uint32_t max_moves = max_nodes()/(6*n_layer);
869
+ // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
870
+ const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
871
+
872
+ // determine which KV cells to move where
873
+ //
874
+ // cell i moves to ids[i]
875
+ //
876
+ // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
877
+ //
878
+ auto & ids = defrag_info.ids;
879
+
880
+ ids.clear();
881
+ ids.resize(n_kv, n_kv);
882
+
883
+ for (uint32_t i0 = 0; i0 < n_used; ++i0) {
884
+ const auto & cell0 = cells[i0];
885
+
886
+ if (!cell0.is_empty()) {
887
+ ids[i0] = i0;
888
+
889
+ continue;
890
+ }
891
+
892
+ // found a hole - fill it with data from the end of the cache
893
+
894
+ uint32_t nh = 1;
895
+
896
+ // determine the size of the hole
897
+ while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
898
+ nh++;
899
+ }
900
+
901
+ uint32_t nf = 0;
902
+ uint32_t is = n_kv - 1;
903
+
904
+ // starting from the end, find nh non-empty cells
905
+ for (; is > i0; --is) {
906
+ const auto & cell1 = cells[is];
907
+
908
+ if (cell1.is_empty() || ids[is] != n_kv) {
909
+ continue;
910
+ }
911
+
912
+ // non-empty cell which is not yet moved
913
+ nf++;
914
+
915
+ if (nf == nh) {
916
+ break;
917
+ }
918
+ }
919
+
920
+ // this can only happen if `n_used` is not accurate, which would be a bug
921
+ GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
922
+
923
+ nf = 0;
924
+
925
+ uint32_t i1 = is;
926
+
927
+ // are we moving a continuous block of memory?
928
+ bool cont = false;
929
+
930
+ // should we stop searching for the next move?
931
+ bool stop = false;
932
+
933
+ // go back and move the nf cells to the hole
934
+ for (; i1 < n_kv; ++i1) {
935
+ auto & cell1 = cells[i1];
936
+
937
+ if (cell1.is_empty() || ids[i1] != n_kv) {
938
+ if (n_moves == max_moves) {
939
+ stop = true;
940
+ break;
941
+ }
942
+
943
+ cont = false;
944
+ continue;
945
+ }
946
+
947
+ // this cell goes to (i0 + nf)
948
+ ids[i1] = i0 + nf;
949
+
950
+ // move the cell meta data
951
+ cells[i0 + nf] = cell1;
952
+
953
+ // clear the old cell and move the head there
954
+ cell1 = kv_cell();
955
+ head = n_used;
956
+
957
+ if (!cont) {
958
+ n_moves++;
959
+ cont = true;
960
+ }
961
+
962
+ nf++;
963
+
964
+ if (nf == nh) {
965
+ break;
966
+ }
967
+ }
968
+
969
+ if (stop || n_moves == max_moves) {
970
+ break;
971
+ }
972
+
973
+ //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
974
+
975
+ i0 += nh - 1;
976
+ }
977
+
978
+ if (n_moves == 0) {
979
+ return false;
980
+ }
981
+
982
+ LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
983
+
984
+ LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
985
+
986
+ return true;
987
+ }
988
+
989
+ uint32_t llama_kv_cache_unified::cell_max() const {
990
+ for (uint32_t i = size; i > 0; --i) {
991
+ const kv_cell & cell = cells[i - 1];
992
+
993
+ if (cell.pos >= 0 && !cell.is_empty()) {
994
+ return i;
995
+ }
996
+ }
997
+
998
+ return 0;
999
+ }
1000
+
1001
+ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
1002
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
1003
+ uint32_t cell_count = 0;
1004
+
1005
+ // Count the number of cells with the specified seq_id
1006
+ // Find all the ranges of cells with this seq id (or all, when -1)
1007
+ uint32_t cell_range_begin = size;
1008
+ for (uint32_t i = 0; i < size; ++i) {
1009
+ const auto & cell = cells[i];
1010
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
1011
+ ++cell_count;
1012
+ if (cell_range_begin == size) {
1013
+ cell_range_begin = i;
1014
+ }
1015
+ } else {
1016
+ if (cell_range_begin != size) {
1017
+ cell_ranges.emplace_back(cell_range_begin, i);
1018
+ cell_range_begin = size;
1019
+ }
1020
+ }
1021
+ }
1022
+ if (cell_range_begin != size) {
1023
+ cell_ranges.emplace_back(cell_range_begin, size);
1024
+ }
1025
+
1026
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
1027
+ uint32_t cell_count_check = 0;
1028
+ for (const auto & range : cell_ranges) {
1029
+ cell_count_check += range.second - range.first;
1030
+ }
1031
+ GGML_ASSERT(cell_count == cell_count_check);
1032
+
1033
+ io.write(&cell_count, sizeof(cell_count));
1034
+
1035
+ state_write_meta(io, cell_ranges, seq_id);
1036
+ state_write_data(io, cell_ranges);
1037
+ }
1038
+
1039
+ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1040
+ uint32_t cell_count;
1041
+ io.read_to(&cell_count, sizeof(cell_count));
1042
+
1043
+ bool res = true;
1044
+ res = res && state_read_meta(io, cell_count, seq_id);
1045
+ res = res && state_read_data(io, cell_count);
1046
+
1047
+ if (!res) {
1048
+ if (seq_id == -1) {
1049
+ clear();
1050
+ } else {
1051
+ seq_rm(seq_id, -1, -1);
1052
+ }
1053
+ throw std::runtime_error("failed to restore kv cache");
1054
+ }
1055
+ }
1056
+
1057
+ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
1058
+ for (const auto & range : cell_ranges) {
1059
+ for (uint32_t i = range.first; i < range.second; ++i) {
1060
+ const auto & cell = cells[i];
1061
+ const llama_pos pos = cell.pos;
1062
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
1063
+
1064
+ io.write(&pos, sizeof(pos));
1065
+ io.write(&n_seq_id, sizeof(n_seq_id));
1066
+
1067
+ if (n_seq_id) {
1068
+ for (auto seq_id : cell.seq_id) {
1069
+ io.write(&seq_id, sizeof(seq_id));
1070
+ }
1071
+ }
1072
+ }
1073
+ }
1074
+ }
1075
+
1076
+ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
1077
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
1078
+ const uint32_t n_layer = hparams.n_layer;
1079
+
1080
+ io.write(&v_trans, sizeof(v_trans));
1081
+ io.write(&n_layer, sizeof(n_layer));
1082
+
1083
+ std::vector<uint8_t> tmp_buf;
1084
+
1085
+ // Iterate and write all the keys first, each row is a cell
1086
+ // Get whole range at a time
1087
+ for (uint32_t il = 0; il < n_layer; ++il) {
1088
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1089
+
1090
+ // Write key type
1091
+ const int32_t k_type_i = (int32_t)k_l[il]->type;
1092
+ io.write(&k_type_i, sizeof(k_type_i));
1093
+
1094
+ // Write row size of key
1095
+ const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1096
+ io.write(&k_size_row, sizeof(k_size_row));
1097
+
1098
+ // Read each range of cells of k_size length each into tmp_buf and write out
1099
+ for (const auto & range : cell_ranges) {
1100
+ const size_t range_size = range.second - range.first;
1101
+ const size_t buf_size = range_size * k_size_row;
1102
+ io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
1103
+ }
1104
+ }
1105
+
1106
+ if (!v_trans) {
1107
+ for (uint32_t il = 0; il < n_layer; ++il) {
1108
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1109
+
1110
+ // Write value type
1111
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1112
+ io.write(&v_type_i, sizeof(v_type_i));
1113
+
1114
+ // Write row size of value
1115
+ const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1116
+ io.write(&v_size_row, sizeof(v_size_row));
1117
+
1118
+ // Read each range of cells of v_size length each into tmp_buf and write out
1119
+ for (const auto & range : cell_ranges) {
1120
+ const size_t range_size = range.second - range.first;
1121
+ const size_t buf_size = range_size * v_size_row;
1122
+ io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
1123
+ }
1124
+ }
1125
+ } else {
1126
+ // When v is transposed, we also need the element size and get the element ranges from each row
1127
+ const uint32_t kv_size = size;
1128
+ for (uint32_t il = 0; il < n_layer; ++il) {
1129
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1130
+
1131
+ // Write value type
1132
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1133
+ io.write(&v_type_i, sizeof(v_type_i));
1134
+
1135
+ // Write element size
1136
+ const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
1137
+ io.write(&v_size_el, sizeof(v_size_el));
1138
+
1139
+ // Write GQA embedding size
1140
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
1141
+
1142
+ // For each row, we get the element values of each cell
1143
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1144
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
1145
+ for (const auto & range : cell_ranges) {
1146
+ const size_t range_size = range.second - range.first;
1147
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1148
+ const size_t buf_size = range_size * v_size_el;
1149
+ io.write_tensor(v_l[il], src_offset, buf_size);
1150
+ }
1151
+ }
1152
+ }
1153
+ }
1154
+ }
1155
+
1156
+ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
1157
+ if (dest_seq_id != -1) {
1158
+ // single sequence
1159
+
1160
+ seq_rm(dest_seq_id, -1, -1);
1161
+
1162
+ llama_sbatch sbatch;
1163
+ llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
1164
+
1165
+ batch.n_tokens = cell_count;
1166
+ batch.n_seq_tokens = cell_count;
1167
+ batch.n_seqs = 1;
1168
+
1169
+ for (uint32_t i = 0; i < cell_count; ++i) {
1170
+ llama_pos pos;
1171
+ uint32_t n_seq_id;
1172
+
1173
+ io.read_to(&pos, sizeof(pos));
1174
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1175
+
1176
+ if (n_seq_id != 0) {
1177
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1178
+ return false;
1179
+ }
1180
+
1181
+ batch.pos[i] = pos;
1182
+ }
1183
+ batch.n_seq_id[0] = 1;
1184
+ batch.seq_id[0] = &dest_seq_id;
1185
+ if (!find_slot(batch)) {
1186
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1187
+ return false;
1188
+ }
1189
+ commit();
1190
+
1191
+ // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
1192
+ // Assume that this is one contiguous block of cells
1193
+ GGML_ASSERT(head + cell_count <= size);
1194
+ GGML_ASSERT(cells[head].pos == batch.pos[0]);
1195
+ GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
1196
+ GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
1197
+ GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
1198
+ } else {
1199
+ // whole KV cache restore
1200
+
1201
+ if (cell_count > size) {
1202
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1203
+ return false;
1204
+ }
1205
+
1206
+ clear();
1207
+
1208
+ for (uint32_t i = 0; i < cell_count; ++i) {
1209
+ kv_cell & cell = cells[i];
1210
+
1211
+ llama_pos pos;
1212
+ uint32_t n_seq_id;
1213
+
1214
+ io.read_to(&pos, sizeof(pos));
1215
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1216
+
1217
+ cell.pos = pos;
1218
+
1219
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1220
+ llama_seq_id seq_id;
1221
+ io.read_to(&seq_id, sizeof(seq_id));
1222
+
1223
+ // TODO: llama_kv_cache_unified should have a notion of max sequences
1224
+ //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
1225
+ if (seq_id < 0) {
1226
+ //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
1227
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
1228
+ return false;
1229
+ }
1230
+
1231
+ cell.seq_id.insert(seq_id);
1232
+ }
1233
+ }
1234
+
1235
+ head = 0;
1236
+ used = cell_count;
1237
+ }
1238
+
1239
+ return true;
1240
+ }
1241
+
1242
+ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
1243
+ uint32_t v_trans;
1244
+ uint32_t n_layer;
1245
+ io.read_to(&v_trans, sizeof(v_trans));
1246
+ io.read_to(&n_layer, sizeof(n_layer));
1247
+
1248
+ if (n_layer != hparams.n_layer) {
1249
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
1250
+ return false;
1251
+ }
1252
+ if (cell_count > size) {
1253
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
1254
+ return false;
1255
+ }
1256
+ if (this->v_trans != (bool) v_trans) {
1257
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
1258
+ return false;
1259
+ }
1260
+
1261
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1262
+ for (uint32_t il = 0; il < n_layer; ++il) {
1263
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1264
+
1265
+ // Read type of key
1266
+ int32_t k_type_i_ref;
1267
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1268
+ const int32_t k_type_i = (int32_t) k_l[il]->type;
1269
+ if (k_type_i != k_type_i_ref) {
1270
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1271
+ return false;
1272
+ }
1273
+
1274
+ // Read row size of key
1275
+ uint64_t k_size_row_ref;
1276
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1277
+ const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1278
+ if (k_size_row != k_size_row_ref) {
1279
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1280
+ return false;
1281
+ }
1282
+
1283
+ if (cell_count) {
1284
+ // Read and set the keys for the whole cell range
1285
+ ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1286
+ }
1287
+ }
1288
+
1289
+ if (!this->v_trans) {
1290
+ for (uint32_t il = 0; il < n_layer; ++il) {
1291
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1292
+
1293
+ // Read type of value
1294
+ int32_t v_type_i_ref;
1295
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1296
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1297
+ if (v_type_i != v_type_i_ref) {
1298
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1299
+ return false;
1300
+ }
1301
+
1302
+ // Read row size of value
1303
+ uint64_t v_size_row_ref;
1304
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1305
+ const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1306
+ if (v_size_row != v_size_row_ref) {
1307
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1308
+ return false;
1309
+ }
1310
+
1311
+ if (cell_count) {
1312
+ // Read and set the values for the whole cell range
1313
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1314
+ }
1315
+ }
1316
+ } else {
1317
+ // For each layer, read the values for each cell (transposed)
1318
+ for (uint32_t il = 0; il < n_layer; ++il) {
1319
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1320
+
1321
+ // Read type of value
1322
+ int32_t v_type_i_ref;
1323
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1324
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
1325
+ if (v_type_i != v_type_i_ref) {
1326
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1327
+ return false;
1328
+ }
1329
+
1330
+ // Read element size of value
1331
+ uint32_t v_size_el_ref;
1332
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1333
+ const size_t v_size_el = ggml_type_size(v_l[il]->type);
1334
+ if (v_size_el != v_size_el_ref) {
1335
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1336
+ return false;
1337
+ }
1338
+
1339
+ // Read GQA embedding size
1340
+ uint32_t n_embd_v_gqa_ref;
1341
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1342
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1343
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1344
+ return false;
1345
+ }
1346
+
1347
+ if (cell_count) {
1348
+ // For each row in the transposed matrix, read the values for the whole cell range
1349
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1350
+ const size_t dst_offset = (head + j * size) * v_size_el;
1351
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1352
+ }
1353
+ }
1354
+ }
1355
+ }
1356
+
1357
+ return true;
1358
+ }
1359
+
1360
+ //
1361
+ // llama_kv_cache_recurrent
1362
+ //
1363
+
1364
+ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
1365
+ const llama_model & model,
1366
+ ggml_type type_k,
1367
+ ggml_type type_v,
1368
+ bool offload,
1369
+ uint32_t kv_size) : hparams(model.hparams) {
1370
+ const int32_t n_layer = hparams.n_layer;
1371
+
1372
+ LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
1373
+ __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
1374
+
1375
+ head = 0;
1376
+ size = kv_size;
1377
+ used = 0;
1378
+
1379
+ this->type_k = type_k;
1380
+ this->type_v = type_v;
1381
+
1382
+ cells.clear();
1383
+ cells.resize(kv_size);
1384
+
1385
+ // create a context for each buffer type
1386
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
1387
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
1388
+ auto it = ctx_map.find(buft);
1389
+ if (it == ctx_map.end()) {
1390
+ ggml_init_params params = {
1391
+ /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
1392
+ /*.mem_buffer =*/ NULL,
1393
+ /*.no_alloc =*/ true,
1394
+ };
1395
+
1396
+ ggml_context * ctx = ggml_init(params);
1397
+ if (!ctx) {
1398
+ return nullptr;
1399
+ }
1400
+
1401
+ ctx_map[buft] = ctx;
1402
+ ctxs.emplace_back(ctx);
1403
+
1404
+ return ctx;
1405
+ }
1406
+
1407
+ return it->second;
1408
+ };
1409
+
1410
+ k_l.reserve(n_layer);
1411
+ v_l.reserve(n_layer);
1412
+
1413
+ for (int i = 0; i < n_layer; i++) {
1414
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
1415
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
1416
+
1417
+ const char * dev_name = "CPU";
1418
+
1419
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
1420
+
1421
+ if (offload) {
1422
+ auto * dev = model.dev_layer(i);
1423
+ buft = ggml_backend_dev_buffer_type(dev);
1424
+
1425
+ dev_name = ggml_backend_dev_name(dev);
1426
+ }
1427
+
1428
+ LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
1429
+
1430
+ ggml_context * ctx = ctx_for_buft(buft);
1431
+ if (!ctx) {
1432
+ throw std::runtime_error("failed to create ggml context for kv cache");
1433
+ }
1434
+
1435
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
1436
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
1437
+ ggml_format_name(k, "cache_k_l%d", i);
1438
+ ggml_format_name(v, "cache_v_l%d", i);
1439
+ k_l.push_back(k);
1440
+ v_l.push_back(v);
1441
+ }
1442
+
1443
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
1444
+ for (auto it : ctx_map) {
1445
+ auto * buft = it.first;
1446
+ auto * ctx = it.second;
1447
+
1448
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
1449
+ if (!buf) {
1450
+ throw std::runtime_error("failed to allocate buffer for kv cache");
1451
+ }
1452
+ ggml_backend_buffer_clear(buf, 0);
1453
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
1454
+ bufs.emplace_back(buf);
1455
+ }
1456
+
1457
+ {
1458
+ const size_t memory_size_k = size_k_bytes();
1459
+ const size_t memory_size_v = size_v_bytes();
1460
+
1461
+ LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
1462
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
1463
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
1464
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
1465
+ }
1466
+ }
1467
+
1468
+ void llama_kv_cache_recurrent::clear() {
1469
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
1470
+ cells[i].pos = -1;
1471
+ cells[i].seq_id.clear();
1472
+ cells[i].src = -1;
1473
+ cells[i].tail = -1;
1474
+ }
1475
+ head = 0;
1476
+ used = 0;
1477
+
1478
+ for (auto & buf : bufs) {
1479
+ ggml_backend_buffer_clear(buf.get(), 0);
1480
+ }
1481
+ }
1482
+
1483
+ bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
1484
+ uint32_t new_head = size;
1485
+
1486
+ if (p0 < 0) {
1487
+ p0 = 0;
1488
+ }
1489
+
1490
+ if (p1 < 0) {
1491
+ p1 = std::numeric_limits<llama_pos>::max();
1492
+ }
1493
+
1494
+ // models like Mamba or RWKV can't have a state partially erased
1495
+ if (seq_id >= (int64_t) size) {
1496
+ // could be fatal
1497
+ return false;
1498
+ }
1499
+ if (0 <= seq_id) {
1500
+ int32_t & tail_id = cells[seq_id].tail;
1501
+ if (tail_id >= 0) {
1502
+ const kv_cell & cell = cells[tail_id];
1503
+ // partial intersection is invalid
1504
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
1505
+ return false;
1506
+ }
1507
+ // invalidate tails which will be cleared
1508
+ if (p0 <= cell.pos && cell.pos < p1) {
1509
+ tail_id = -1;
1510
+ }
1511
+ }
1512
+ } else {
1513
+ // seq_id is negative, then the range should include everything or nothing
1514
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
1515
+ return false;
1516
+ }
1517
+ }
1518
+
1519
+ for (uint32_t i = 0; i < size; ++i) {
1520
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
1521
+ if (seq_id < 0) {
1522
+ cells[i].seq_id.clear();
1523
+ } else if (cells[i].has_seq_id(seq_id)) {
1524
+ cells[i].seq_id.erase(seq_id);
1525
+ } else {
1526
+ continue;
1527
+ }
1528
+ if (cells[i].is_empty()) {
1529
+ // keep count of the number of used cells
1530
+ if (cells[i].pos >= 0) {
1531
+ used--;
1532
+ }
1533
+ cells[i].pos = -1;
1534
+ cells[i].src = -1;
1535
+ if (new_head == size) {
1536
+ new_head = i;
1537
+ }
1538
+ }
1539
+ }
1540
+ }
1541
+
1542
+ // If we freed up a slot, set head to it so searching can start there.
1543
+ if (new_head != size && new_head < head) {
1544
+ head = new_head;
1545
+ }
1546
+
1547
+ return true;
1548
+ }
1549
+
1550
+ void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
1551
+ if (seq_id_src == seq_id_dst) {
1552
+ return;
1553
+ }
1554
+
1555
+ if (p0 < 0) {
1556
+ p0 = 0;
1557
+ }
1558
+
1559
+ if (p1 < 0) {
1560
+ p1 = std::numeric_limits<llama_pos>::max();
1561
+ }
1562
+
1563
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
1564
+ kv_cell & tail_src = cells[seq_id_src];
1565
+ kv_cell & tail_dst = cells[seq_id_dst];
1566
+ if (tail_dst.tail >= 0) {
1567
+ // clear destination seq_id if it wasn't empty
1568
+ kv_cell & cell_dst = cells[tail_dst.tail];
1569
+
1570
+ cell_dst.seq_id.erase(seq_id_dst);
1571
+ tail_dst.tail = -1;
1572
+ if (cell_dst.seq_id.empty()) {
1573
+ cell_dst.pos = -1;
1574
+ cell_dst.src = -1;
1575
+ used -= 1;
1576
+ }
1577
+ }
1578
+ if (tail_src.tail >= 0) {
1579
+ kv_cell & cell_src = cells[tail_src.tail];
1580
+
1581
+ cell_src.seq_id.insert(seq_id_dst);
1582
+ tail_dst.tail = tail_src.tail;
1583
+ }
1584
+ }
1585
+ }
1586
+
1587
+ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
1588
+ uint32_t new_head = size;
1589
+
1590
+ for (uint32_t i = 0; i < size; ++i) {
1591
+ if ((llama_seq_id) i != seq_id) {
1592
+ cells[i].tail = -1;
1593
+ }
1594
+
1595
+ if (!cells[i].has_seq_id(seq_id)) {
1596
+ if (cells[i].pos >= 0) {
1597
+ used--;
1598
+ }
1599
+
1600
+ cells[i].pos = -1;
1601
+ cells[i].src = -1;
1602
+ cells[i].seq_id.clear();
1603
+
1604
+ if (new_head == size){
1605
+ new_head = i;
1606
+ }
1607
+ } else {
1608
+ cells[i].seq_id.clear();
1609
+ cells[i].seq_id.insert(seq_id);
1610
+ }
1611
+ }
1612
+
1613
+ // If we freed up a slot, set head to it so searching can start there.
1614
+ if (new_head != size && new_head < head) {
1615
+ head = new_head;
1616
+ }
1617
+ }
1618
+
1619
+ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
1620
+ if (delta == 0) {
1621
+ return;
1622
+ }
1623
+
1624
+ if (p0 < 0) {
1625
+ p0 = 0;
1626
+ }
1627
+
1628
+ if (p1 < 0) {
1629
+ p1 = std::numeric_limits<llama_pos>::max();
1630
+ }
1631
+
1632
+ // If there is no range then return early to avoid looping over the
1633
+ if (p0 == p1) {
1634
+ return;
1635
+ }
1636
+
1637
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
1638
+ if (0 <= seq_id && seq_id < (int64_t) size) {
1639
+ const int32_t tail_id = cells[seq_id].tail;
1640
+ if (tail_id >= 0) {
1641
+ kv_cell & cell = cells[tail_id];
1642
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
1643
+ cell.pos += delta;
1644
+ }
1645
+ }
1646
+ }
1647
+ }
1648
+
1649
+ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
1650
+ if (d == 1) {
1651
+ return;
1652
+ }
1653
+
1654
+ if (p0 < 0) {
1655
+ p0 = 0;
1656
+ }
1657
+
1658
+ if (p1 < 0) {
1659
+ p1 = std::numeric_limits<llama_pos>::max();
1660
+ }
1661
+
1662
+ // If there is no range then return early to avoid looping over the cache.
1663
+ if (p0 == p1) {
1664
+ return;
1665
+ }
1666
+
1667
+ // for Mamba-like or RWKV models, only the pos needs to be changed
1668
+ if (0 <= seq_id && seq_id < (int64_t) size) {
1669
+ const int32_t tail_id = cells[seq_id].tail;
1670
+ if (tail_id >= 0) {
1671
+ kv_cell & cell = cells[tail_id];
1672
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
1673
+ cell.pos /= d;
1674
+ }
1675
+ }
1676
+ }
1677
+ }
1678
+
1679
+ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
1680
+ llama_pos result = 0;
1681
+
1682
+ for (uint32_t i = 0; i < size; ++i) {
1683
+ if (cells[i].has_seq_id(seq_id)) {
1684
+ result = std::max(result, cells[i].pos);
1685
+ }
1686
+ }
1687
+
1688
+ return result;
1689
+ }
1690
+
1691
+ void llama_kv_cache_recurrent::restore() {
1692
+ if (pending.ranges.empty()) {
1693
+ return;
1694
+ }
1695
+
1696
+ seq_rm(-1, -1, -1);
1697
+ }
1698
+
1699
+ void llama_kv_cache_recurrent::commit() {
1700
+ pending.ranges.clear();
1701
+ }
1702
+
1703
+ bool llama_kv_cache_recurrent::update(llama_context & lctx) {
1704
+ GGML_UNUSED(lctx);
1705
+ return false;
1706
+ }
1707
+
1708
+ void llama_kv_cache_recurrent::defrag_sched(float thold) {
1709
+ GGML_UNUSED(thold);
1710
+ // noop
1711
+ }
1712
+
1713
+ void llama_kv_cache_recurrent::set_full() {
1714
+ n = size;
1715
+ }
1716
+
1717
+ llama_sbatch llama_kv_cache_recurrent::sbatch_init(
1718
+ const llama_batch & batch,
1719
+ bool logits_all) {
1720
+ return llama_sbatch(batch, hparams.n_embd, false, logits_all);
1721
+ }
1722
+
1723
+ llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
1724
+ if (embd_pooled) {
1725
+ // Pooled embeddings cannot be split across ubatches (yet)
1726
+ return sbatch.split_seq(n_ubatch);
1727
+ }
1728
+
1729
+ return sbatch.split_equal(n_ubatch);
1730
+ }
1731
+
1732
+ bool llama_kv_cache_recurrent::find_slot(
1733
+ const llama_ubatch & ubatch) {
1734
+ const uint32_t n_tokens = ubatch.n_tokens;
1735
+ const uint32_t n_seqs = ubatch.n_seqs;
1736
+
1737
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
1738
+
1739
+ // if we have enough unused cells before the current head ->
1740
+ // better to start searching from the beginning of the cache, hoping to fill it
1741
+ if (head > used + 2*n_tokens) {
1742
+ head = 0;
1743
+ }
1744
+
1745
+ // For recurrent state architectures (like Mamba or RWKV),
1746
+ // each cache cell can store the state for a whole sequence.
1747
+ // A slot should be always be contiguous.
1748
+
1749
+ // can only process batches with an equal number of new tokens in each sequence
1750
+ GGML_ASSERT(ubatch.equal_seqs);
1751
+
1752
+ int32_t min = size - 1;
1753
+ int32_t max = 0;
1754
+
1755
+ // everything should fit if all seq_ids are smaller than the max
1756
+ for (uint32_t s = 0; s < n_seqs; ++s) {
1757
+ const uint32_t n_seq_id = ubatch.n_seq_id[s];
1758
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1759
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
1760
+
1761
+ if (seq_id < 0 || (uint32_t) seq_id >= size) {
1762
+ // too big seq_id
1763
+ // TODO: would it be possible to resize the cache instead?
1764
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
1765
+ return false;
1766
+ }
1767
+ if (j > 0) {
1768
+ kv_cell & seq = cells[seq_id];
1769
+ if (seq.tail >= 0) {
1770
+ kv_cell & cell = cells[seq.tail];
1771
+ // clear cells from seq_ids that become shared
1772
+ // (should not normally happen, but let's handle it anyway)
1773
+ cell.seq_id.erase(seq_id);
1774
+ seq.tail = -1;
1775
+ if (cell.seq_id.empty()) {
1776
+ cell.pos = -1;
1777
+ cell.src = -1;
1778
+ used -= 1;
1779
+ }
1780
+ }
1781
+ }
1782
+ }
1783
+ }
1784
+
1785
+ #ifndef NDEBUG
1786
+ {
1787
+ std::vector<int32_t> tails_verif;
1788
+ tails_verif.assign(size, -1);
1789
+ for (uint32_t i = 0; i < size; ++i) {
1790
+ kv_cell & cell = cells[i];
1791
+ for (llama_seq_id seq_id : cell.seq_id) {
1792
+ if (tails_verif[seq_id] != -1) {
1793
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
1794
+ }
1795
+ tails_verif[seq_id] = i;
1796
+ }
1797
+ }
1798
+ for (uint32_t i = 0; i < size; ++i) {
1799
+ if (tails_verif[i] != cells[i].tail) {
1800
+ LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
1801
+ }
1802
+ }
1803
+ }
1804
+ #endif
1805
+
1806
+ // find next empty cell
1807
+ uint32_t next_empty_cell = head;
1808
+
1809
+ for (uint32_t i = 0; i < size; ++i) {
1810
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
1811
+ kv_cell & cell = cells[next_empty_cell];
1812
+ if (cell.is_empty()) { break; }
1813
+ next_empty_cell += 1;
1814
+ }
1815
+
1816
+ // find usable cell range
1817
+ for (uint32_t s = 0; s < n_seqs; ++s) {
1818
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
1819
+ kv_cell & seq_meta = cells[seq_id];
1820
+ bool has_cell = false;
1821
+ if (seq_meta.tail >= 0) {
1822
+ kv_cell & cell = cells[seq_meta.tail];
1823
+ GGML_ASSERT(cell.has_seq_id(seq_id));
1824
+ // does this seq_id "own" the cell?
1825
+ if (cell.seq_id.size() == 1) { has_cell = true; }
1826
+ }
1827
+ if (!has_cell) {
1828
+ kv_cell & empty_cell = cells[next_empty_cell];
1829
+ GGML_ASSERT(empty_cell.is_empty());
1830
+ // copy old tail into the empty cell
1831
+ if (seq_meta.tail >= 0) {
1832
+ kv_cell & orig_cell = cells[seq_meta.tail];
1833
+ empty_cell.pos = orig_cell.pos;
1834
+ empty_cell.src = orig_cell.src;
1835
+ orig_cell.seq_id.erase(seq_id);
1836
+ empty_cell.seq_id.insert(seq_id); // will be overwritten
1837
+ }
1838
+ seq_meta.tail = next_empty_cell;
1839
+ // find next empty cell
1840
+ if (s + 1 < n_seqs) {
1841
+ next_empty_cell += 1;
1842
+ for (uint32_t i = 0; i < size; ++i) {
1843
+ if (next_empty_cell >= size) { next_empty_cell -= size; }
1844
+ kv_cell & cell = cells[next_empty_cell];
1845
+ if (cell.is_empty()) { break; }
1846
+ next_empty_cell += 1;
1847
+ }
1848
+ }
1849
+ }
1850
+ if (min > seq_meta.tail) { min = seq_meta.tail; }
1851
+ if (max < seq_meta.tail) { max = seq_meta.tail; }
1852
+ }
1853
+
1854
+ // gather and re-order
1855
+ for (uint32_t s = 0; s < n_seqs; ++s) {
1856
+ int32_t dst_id = s + min;
1857
+ int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
1858
+ if (dst_id != src_id) {
1859
+ kv_cell & dst_cell = cells[dst_id];
1860
+ kv_cell & src_cell = cells[src_id];
1861
+
1862
+ std::swap(dst_cell.pos, src_cell.pos);
1863
+ std::swap(dst_cell.src, src_cell.src);
1864
+ std::swap(dst_cell.seq_id, src_cell.seq_id);
1865
+
1866
+ // swap tails (assuming they NEVER overlap)
1867
+ for (const llama_seq_id seq_id : src_cell.seq_id) {
1868
+ cells[seq_id].tail = src_id;
1869
+ }
1870
+ for (const llama_seq_id seq_id : dst_cell.seq_id) {
1871
+ cells[seq_id].tail = dst_id;
1872
+ }
1873
+ }
1874
+ }
1875
+
1876
+ // update the pos of the used seqs
1877
+ for (uint32_t s = 0; s < n_seqs; ++s) {
1878
+ const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
1879
+ int32_t cell_id = s + min;
1880
+ kv_cell & cell = cells[cell_id];
1881
+
1882
+ if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
1883
+ // What should happen when the pos backtracks or skips a value?
1884
+ // Clearing the state mid-batch would require special-casing which isn't done.
1885
+ LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
1886
+ __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
1887
+ }
1888
+ cell.pos = last_pos;
1889
+ cell.seq_id.clear();
1890
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
1891
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
1892
+ cell.seq_id.insert(seq_id);
1893
+ cells[seq_id].tail = cell_id;
1894
+ }
1895
+ }
1896
+
1897
+ // allow getting the range of used cells, from head to head + n
1898
+ head = min;
1899
+ n = max - min + 1;
1900
+ used = std::count_if(cells.begin(), cells.end(),
1901
+ [](const kv_cell & cell){ return !cell.is_empty(); });
1902
+
1903
+ // sanity check
1904
+ return n >= n_seqs;
1905
+ }
1906
+
1907
+ int32_t llama_kv_cache_recurrent::get_n_tokens() const {
1908
+ int32_t result = 0;
1909
+
1910
+ for (uint32_t i = 0; i < size; i++) {
1911
+ result += cells[i].seq_id.size();
1912
+ }
1913
+
1914
+ return result;
1915
+ }
1916
+
1917
+ int32_t llama_kv_cache_recurrent::get_used_cells() const {
1918
+ return used;
1919
+ }
1920
+
1921
+ llama_pos llama_kv_cache_recurrent::get_pos_max() const {
1922
+ llama_pos pos_max = -1;
1923
+ for (const auto & cell : cells) {
1924
+ pos_max = std::max(pos_max, cell.pos);
1925
+ }
1926
+
1927
+ return pos_max;
1928
+ }
1929
+
1930
+ bool llama_kv_cache_recurrent::get_can_shift() const {
1931
+ return false;
1932
+ }
1933
+
1934
+ int32_t llama_kv_cache_recurrent::s_copy(int i) const {
1935
+ const uint32_t cell_id = i + head;
1936
+
1937
+ //////////////////////////////////////////////
1938
+ // TODO: this should not mutate the KV cache !
1939
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
1940
+
1941
+ // prevent out-of-bound sources
1942
+ if (cell.src < 0 || (uint32_t) cell.src >= size) {
1943
+ cell.src = cell_id;
1944
+ }
1945
+
1946
+ int32_t res = cell.src;
1947
+
1948
+ // TODO: do not mutate the KV cache
1949
+ // ensure copy only happens once
1950
+ if (cell.src != (int32_t) cell_id) {
1951
+ cell.src = cell_id;
1952
+ }
1953
+
1954
+ return res;
1955
+ }
1956
+
1957
+ float llama_kv_cache_recurrent::s_mask(int i) const {
1958
+ const uint32_t cell_id = i + head;
1959
+
1960
+ //////////////////////////////////////////////
1961
+ // TODO: this should not mutate the KV cache !
1962
+ kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
1963
+
1964
+ float res = (float) (cell.src >= 0);
1965
+
1966
+ // only clear once
1967
+ if (cell.src < 0) {
1968
+ cell.src = cell_id;
1969
+ }
1970
+
1971
+ return res;
1972
+ }
1973
+
1974
+ uint32_t llama_kv_cache_recurrent::cell_max() const {
1975
+ for (uint32_t i = size; i > 0; --i) {
1976
+ const kv_cell & cell = cells[i - 1];
1977
+
1978
+ if (cell.pos >= 0 && !cell.is_empty()) {
1979
+ return i;
1980
+ }
1981
+ }
1982
+
1983
+ return 0;
1984
+ }
1985
+
1986
+ size_t llama_kv_cache_recurrent::total_size() const {
1987
+ size_t size = 0;
1988
+ for (const auto & buf : bufs) {
1989
+ size += ggml_backend_buffer_get_size(buf.get());
1990
+ }
1991
+
1992
+ return size;
1993
+ }
1994
+
1995
+ size_t llama_kv_cache_recurrent::size_k_bytes() const {
1996
+ size_t size_k_bytes = 0;
1997
+
1998
+ for (const auto & k : k_l) {
1999
+ size_k_bytes += ggml_nbytes(k);
2000
+ }
2001
+
2002
+ return size_k_bytes;
2003
+ }
2004
+
2005
+ size_t llama_kv_cache_recurrent::size_v_bytes() const {
2006
+ size_t size_v_bytes = 0;
2007
+
2008
+ for (const auto & v : v_l) {
2009
+ size_v_bytes += ggml_nbytes(v);
2010
+ }
2011
+
2012
+ return size_v_bytes;
2013
+ }
2014
+
2015
+ void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
2016
+ std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
2017
+ uint32_t cell_count = 0;
2018
+
2019
+ // Count the number of cells with the specified seq_id
2020
+ // Find all the ranges of cells with this seq id (or all, when -1)
2021
+ uint32_t cell_range_begin = size;
2022
+ for (uint32_t i = 0; i < size; ++i) {
2023
+ const auto & cell = cells[i];
2024
+ if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
2025
+ ++cell_count;
2026
+ if (cell_range_begin == size) {
2027
+ cell_range_begin = i;
2028
+ }
2029
+ } else {
2030
+ if (cell_range_begin != size) {
2031
+ cell_ranges.emplace_back(cell_range_begin, i);
2032
+ cell_range_begin = size;
2033
+ }
2034
+ }
2035
+ }
2036
+ if (cell_range_begin != size) {
2037
+ cell_ranges.emplace_back(cell_range_begin, size);
2038
+ }
2039
+
2040
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
2041
+ uint32_t cell_count_check = 0;
2042
+ for (const auto & range : cell_ranges) {
2043
+ cell_count_check += range.second - range.first;
2044
+ }
2045
+ GGML_ASSERT(cell_count == cell_count_check);
2046
+
2047
+ io.write(&cell_count, sizeof(cell_count));
2048
+
2049
+ state_write_meta(io, cell_ranges, seq_id);
2050
+ state_write_data(io, cell_ranges);
2051
+ }
2052
+
2053
+ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
2054
+ uint32_t cell_count;
2055
+ io.read_to(&cell_count, sizeof(cell_count));
2056
+
2057
+ bool res = true;
2058
+ res = res && state_read_meta(io, cell_count, seq_id);
2059
+ res = res && state_read_data(io, cell_count);
2060
+
2061
+ if (!res) {
2062
+ if (seq_id == -1) {
2063
+ clear();
2064
+ } else {
2065
+ seq_rm(seq_id, -1, -1);
2066
+ }
2067
+ throw std::runtime_error("failed to restore kv cache");
2068
+ }
2069
+ }
2070
+
2071
+ void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
2072
+ for (const auto & range : cell_ranges) {
2073
+ for (uint32_t i = range.first; i < range.second; ++i) {
2074
+ const auto & cell = cells[i];
2075
+ const llama_pos pos = cell.pos;
2076
+ const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
2077
+
2078
+ io.write(&pos, sizeof(pos));
2079
+ io.write(&n_seq_id, sizeof(n_seq_id));
2080
+
2081
+ if (n_seq_id) {
2082
+ for (auto seq_id : cell.seq_id) {
2083
+ io.write(&seq_id, sizeof(seq_id));
2084
+ }
2085
+ }
2086
+ }
2087
+ }
2088
+ }
2089
+
2090
+ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
2091
+ const uint32_t v_trans = 0;
2092
+ const uint32_t n_layer = hparams.n_layer;
2093
+
2094
+ io.write(&v_trans, sizeof(v_trans));
2095
+ io.write(&n_layer, sizeof(n_layer));
2096
+
2097
+ std::vector<uint8_t> tmp_buf;
2098
+
2099
+ // Iterate and write all the keys first, each row is a cell
2100
+ // Get whole range at a time
2101
+ for (uint32_t il = 0; il < n_layer; ++il) {
2102
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
2103
+
2104
+ // Write key type
2105
+ const int32_t k_type_i = (int32_t)k_l[il]->type;
2106
+ io.write(&k_type_i, sizeof(k_type_i));
2107
+
2108
+ // Write row size of key
2109
+ const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
2110
+ io.write(&k_size_row, sizeof(k_size_row));
2111
+
2112
+ // Read each range of cells of k_size length each into tmp_buf and write out
2113
+ for (const auto & range : cell_ranges) {
2114
+ const size_t range_size = range.second - range.first;
2115
+ const size_t buf_size = range_size * k_size_row;
2116
+ io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
2117
+ }
2118
+ }
2119
+
2120
+ if (!v_trans) {
2121
+ for (uint32_t il = 0; il < n_layer; ++il) {
2122
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
2123
+
2124
+ // Write value type
2125
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
2126
+ io.write(&v_type_i, sizeof(v_type_i));
2127
+
2128
+ // Write row size of value
2129
+ const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
2130
+ io.write(&v_size_row, sizeof(v_size_row));
2131
+
2132
+ // Read each range of cells of v_size length each into tmp_buf and write out
2133
+ for (const auto & range : cell_ranges) {
2134
+ const size_t range_size = range.second - range.first;
2135
+ const size_t buf_size = range_size * v_size_row;
2136
+ io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
2137
+ }
2138
+ }
2139
+ } else {
2140
+ // When v is transposed, we also need the element size and get the element ranges from each row
2141
+ const uint32_t kv_size = size;
2142
+ for (uint32_t il = 0; il < n_layer; ++il) {
2143
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
2144
+
2145
+ // Write value type
2146
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
2147
+ io.write(&v_type_i, sizeof(v_type_i));
2148
+
2149
+ // Write element size
2150
+ const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
2151
+ io.write(&v_size_el, sizeof(v_size_el));
2152
+
2153
+ // Write GQA embedding size
2154
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
2155
+
2156
+ // For each row, we get the element values of each cell
2157
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
2158
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
2159
+ for (const auto & range : cell_ranges) {
2160
+ const size_t range_size = range.second - range.first;
2161
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
2162
+ const size_t buf_size = range_size * v_size_el;
2163
+ io.write_tensor(v_l[il], src_offset, buf_size);
2164
+ }
2165
+ }
2166
+ }
2167
+ }
2168
+ }
2169
+
2170
+ bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
2171
+ if (dest_seq_id != -1) {
2172
+ // single sequence
2173
+
2174
+ seq_rm(dest_seq_id, -1, -1);
2175
+
2176
+ llama_sbatch sbatch;
2177
+ llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
2178
+
2179
+ batch.n_tokens = cell_count;
2180
+ batch.n_seq_tokens = cell_count;
2181
+ batch.n_seqs = 1;
2182
+
2183
+ for (uint32_t i = 0; i < cell_count; ++i) {
2184
+ llama_pos pos;
2185
+ uint32_t n_seq_id;
2186
+
2187
+ io.read_to(&pos, sizeof(pos));
2188
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
2189
+
2190
+ if (n_seq_id != 0) {
2191
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
2192
+ return false;
2193
+ }
2194
+
2195
+ batch.pos[i] = pos;
2196
+ }
2197
+ batch.n_seq_id[0] = 1;
2198
+ batch.seq_id[0] = &dest_seq_id;
2199
+ if (!find_slot(batch)) {
2200
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
2201
+ return false;
2202
+ }
2203
+ commit();
2204
+
2205
+ // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
2206
+ // Assume that this is one contiguous block of cells
2207
+ GGML_ASSERT(head + cell_count <= size);
2208
+ GGML_ASSERT(cells[head].pos == batch.pos[0]);
2209
+ GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
2210
+ GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
2211
+ GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
2212
+ } else {
2213
+ // whole KV cache restore
2214
+
2215
+ if (cell_count > size) {
2216
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
2217
+ return false;
2218
+ }
2219
+
2220
+ clear();
2221
+
2222
+ for (uint32_t i = 0; i < cell_count; ++i) {
2223
+ kv_cell & cell = cells[i];
2224
+
2225
+ llama_pos pos;
2226
+ uint32_t n_seq_id;
2227
+
2228
+ io.read_to(&pos, sizeof(pos));
2229
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
2230
+
2231
+ cell.pos = pos;
2232
+
2233
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
2234
+ llama_seq_id seq_id;
2235
+ io.read_to(&seq_id, sizeof(seq_id));
2236
+
2237
+ // TODO: llama_kv_cache_recurrent should have a notion of max sequences
2238
+ //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
2239
+ if (seq_id < 0) {
2240
+ //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
2241
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
2242
+ return false;
2243
+ }
2244
+
2245
+ cell.seq_id.insert(seq_id);
2246
+
2247
+ int32_t & tail = cells[seq_id].tail;
2248
+ if (tail != -1) {
2249
+ LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
2250
+ return false;
2251
+ }
2252
+ tail = i;
2253
+ }
2254
+ }
2255
+
2256
+ head = 0;
2257
+ used = cell_count;
2258
+ }
2259
+
2260
+ for (uint32_t i = 0; i < cell_count; ++i) {
2261
+ uint32_t cell_id = head + i;
2262
+ // make sure the recurrent states will keep their restored state
2263
+ cells[cell_id].src = cell_id;
2264
+ }
2265
+
2266
+ return true;
2267
+ }
2268
+
2269
+ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
2270
+ uint32_t v_trans;
2271
+ uint32_t n_layer;
2272
+ io.read_to(&v_trans, sizeof(v_trans));
2273
+ io.read_to(&n_layer, sizeof(n_layer));
2274
+
2275
+ if (n_layer != hparams.n_layer) {
2276
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
2277
+ return false;
2278
+ }
2279
+ if (cell_count > size) {
2280
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
2281
+ return false;
2282
+ }
2283
+ if (false != (bool) v_trans) {
2284
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
2285
+ return false;
2286
+ }
2287
+
2288
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
2289
+ for (uint32_t il = 0; il < n_layer; ++il) {
2290
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
2291
+
2292
+ // Read type of key
2293
+ int32_t k_type_i_ref;
2294
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
2295
+ const int32_t k_type_i = (int32_t) k_l[il]->type;
2296
+ if (k_type_i != k_type_i_ref) {
2297
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
2298
+ return false;
2299
+ }
2300
+
2301
+ // Read row size of key
2302
+ uint64_t k_size_row_ref;
2303
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
2304
+ const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
2305
+ if (k_size_row != k_size_row_ref) {
2306
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
2307
+ return false;
2308
+ }
2309
+
2310
+ if (cell_count) {
2311
+ // Read and set the keys for the whole cell range
2312
+ ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
2313
+ }
2314
+ }
2315
+
2316
+ if (!v_trans) {
2317
+ for (uint32_t il = 0; il < n_layer; ++il) {
2318
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
2319
+
2320
+ // Read type of value
2321
+ int32_t v_type_i_ref;
2322
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
2323
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
2324
+ if (v_type_i != v_type_i_ref) {
2325
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
2326
+ return false;
2327
+ }
2328
+
2329
+ // Read row size of value
2330
+ uint64_t v_size_row_ref;
2331
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
2332
+ const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
2333
+ if (v_size_row != v_size_row_ref) {
2334
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
2335
+ return false;
2336
+ }
2337
+
2338
+ if (cell_count) {
2339
+ // Read and set the values for the whole cell range
2340
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
2341
+ }
2342
+ }
2343
+ } else {
2344
+ // For each layer, read the values for each cell (transposed)
2345
+ for (uint32_t il = 0; il < n_layer; ++il) {
2346
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
2347
+
2348
+ // Read type of value
2349
+ int32_t v_type_i_ref;
2350
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
2351
+ const int32_t v_type_i = (int32_t)v_l[il]->type;
2352
+ if (v_type_i != v_type_i_ref) {
2353
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
2354
+ return false;
2355
+ }
2356
+
2357
+ // Read element size of value
2358
+ uint32_t v_size_el_ref;
2359
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
2360
+ const size_t v_size_el = ggml_type_size(v_l[il]->type);
2361
+ if (v_size_el != v_size_el_ref) {
2362
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
2363
+ return false;
2364
+ }
2365
+
2366
+ // Read GQA embedding size
2367
+ uint32_t n_embd_v_gqa_ref;
2368
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
2369
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
2370
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
2371
+ return false;
2372
+ }
2373
+
2374
+ if (cell_count) {
2375
+ // For each row in the transposed matrix, read the values for the whole cell range
2376
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
2377
+ const size_t dst_offset = (head + j * size) * v_size_el;
2378
+ ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
2379
+ }
2380
+ }
2381
+ }
2382
+ }
2383
+
2384
+ return true;
2385
+ }
2386
+
2387
+ //
2388
+ // kv cache view
2389
+ //
2390
+
2391
+ llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
2392
+ llama_kv_cache_view result = {
2393
+ /*.n_cells = */ 0,
2394
+ /*.n_seq_max = */ n_seq_max,
2395
+ /*.token_count = */ 0,
2396
+ /*.used_cells = */ kv.get_used_cells(),
2397
+ /*.max_contiguous = */ 0,
2398
+ /*.max_contiguous_idx = */ -1,
2399
+ /*.cells = */ nullptr,
2400
+ /*.cells_sequences = */ nullptr,
2401
+ };
2402
+
2403
+ return result;
2404
+ }
2405
+
2406
+ void llama_kv_cache_view_free(llama_kv_cache_view * view) {
2407
+ if (view->cells != nullptr) {
2408
+ free(view->cells);
2409
+ view->cells = nullptr;
2410
+ }
2411
+ if (view->cells_sequences != nullptr) {
2412
+ free(view->cells_sequences);
2413
+ view->cells_sequences = nullptr;
2414
+ }
2415
+ }
2416
+
2417
+ void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
2418
+ // TODO: rework this in the future, for now quick hack
2419
+ const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
2420
+ if (kvu == nullptr) {
2421
+ LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
2422
+ return;
2423
+ }
2424
+
2425
+ if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
2426
+ view->n_cells = int32_t(kvu->size);
2427
+ void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
2428
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
2429
+ view->cells = (llama_kv_cache_view_cell *)p;
2430
+ p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
2431
+ GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
2432
+ view->cells_sequences = (llama_seq_id *)p;
2433
+ }
2434
+
2435
+ const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
2436
+ llama_kv_cache_view_cell * c_curr = view->cells;
2437
+ llama_seq_id * cs_curr = view->cells_sequences;
2438
+ int32_t used_cells = 0;
2439
+ int32_t token_count = 0;
2440
+ int32_t curr_contig_idx = -1;
2441
+ uint32_t max_contig = 0;
2442
+ int32_t max_contig_idx = -1;
2443
+
2444
+ for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
2445
+ const size_t curr_size = kv_cells[i].seq_id.size();
2446
+ token_count += curr_size;
2447
+ c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
2448
+
2449
+ if (curr_size > 0) {
2450
+ if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
2451
+ max_contig = i - curr_contig_idx;
2452
+ max_contig_idx = curr_contig_idx;
2453
+ }
2454
+ curr_contig_idx = -1;
2455
+ } else if (curr_contig_idx < 0) {
2456
+ curr_contig_idx = i;
2457
+ }
2458
+
2459
+ int seq_idx = 0;
2460
+ for (const llama_seq_id it : kv_cells[i].seq_id) {
2461
+ if (seq_idx >= view->n_seq_max) {
2462
+ break;
2463
+ }
2464
+ cs_curr[seq_idx] = it;
2465
+ seq_idx++;
2466
+ }
2467
+ if (seq_idx != 0) {
2468
+ used_cells++;
2469
+ }
2470
+ for (; seq_idx < view->n_seq_max; seq_idx++) {
2471
+ cs_curr[seq_idx] = -1;
2472
+ }
2473
+ }
2474
+ if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
2475
+ max_contig_idx = curr_contig_idx;
2476
+ max_contig = kv_cells.size() - curr_contig_idx;
2477
+ }
2478
+ view->max_contiguous = max_contig;
2479
+ view->max_contiguous_idx = max_contig_idx;
2480
+ view->token_count = token_count;
2481
+ view->used_cells = used_cells;
2482
+ if (uint32_t(used_cells) != kvu->used) {
2483
+ LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
2484
+ __func__, kvu->used, used_cells);
2485
+ }
2486
+ }