@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -23,32 +23,21 @@ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
23
23
  }
24
24
 
25
25
  llama_kv_cache_unified::llama_kv_cache_unified(
26
- const llama_model & model,
27
- ggml_type type_k,
28
- ggml_type type_v,
29
- bool v_trans,
30
- bool offload,
31
- uint32_t kv_size,
32
- uint32_t padding) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding) {
33
- const int32_t n_layer = hparams.n_layer;
34
-
35
- has_shift = false;
36
- can_shift = true;
37
-
38
- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d, padding = %d\n",
39
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift, padding);
40
-
41
- GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
42
-
43
- head = 0;
44
- size = kv_size;
45
- used = 0;
46
-
47
- this->type_k = type_k;
48
- this->type_v = type_v;
49
-
50
- cells.clear();
51
- cells.resize(kv_size);
26
+ const llama_model & model,
27
+ layer_filter_cb && filter,
28
+ ggml_type type_k,
29
+ ggml_type type_v,
30
+ bool v_trans,
31
+ bool offload,
32
+ uint32_t kv_size,
33
+ uint32_t n_seq_max,
34
+ uint32_t n_pad,
35
+ uint32_t n_swa,
36
+ llama_swa_type swa_type) :
37
+ model(model), hparams(model.hparams), v_trans(v_trans),
38
+ n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
39
+
40
+ GGML_ASSERT(kv_size % n_pad == 0);
52
41
 
53
42
  // create a context for each buffer type
54
43
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -56,7 +45,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
56
45
  auto it = ctx_map.find(buft);
57
46
  if (it == ctx_map.end()) {
58
47
  ggml_init_params params = {
59
- /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
48
+ /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
60
49
  /*.mem_buffer =*/ NULL,
61
50
  /*.no_alloc =*/ true,
62
51
  };
@@ -75,37 +64,48 @@ llama_kv_cache_unified::llama_kv_cache_unified(
75
64
  return it->second;
76
65
  };
77
66
 
78
- k_l.reserve(n_layer);
79
- v_l.reserve(n_layer);
67
+ head = 0;
80
68
 
81
- for (int i = 0; i < n_layer; i++) {
82
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
83
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
69
+ cells.resize(kv_size);
70
+
71
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
72
+ if (filter && !filter(il)) {
73
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
74
+ continue;
75
+ }
76
+
77
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
78
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
84
79
 
85
80
  const char * dev_name = "CPU";
86
81
 
87
82
  ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
88
83
 
89
84
  if (offload) {
90
- auto * dev = model.dev_layer(i);
85
+ auto * dev = model.dev_layer(il);
91
86
  buft = ggml_backend_dev_buffer_type(dev);
92
87
 
93
88
  dev_name = ggml_backend_dev_name(dev);
94
89
  }
95
90
 
96
- LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, i, dev_name);
91
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
97
92
 
98
93
  ggml_context * ctx = ctx_for_buft(buft);
99
94
  if (!ctx) {
100
95
  throw std::runtime_error("failed to create ggml context for kv cache");
101
96
  }
102
97
 
103
- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
104
- ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
105
- ggml_format_name(k, "cache_k_l%d", i);
106
- ggml_format_name(v, "cache_v_l%d", i);
107
- k_l.push_back(k);
108
- v_l.push_back(v);
98
+ ggml_tensor * k;
99
+ ggml_tensor * v;
100
+
101
+ k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
102
+ v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
103
+
104
+ ggml_format_name(k, "cache_k_l%d", il);
105
+ ggml_format_name(v, "cache_v_l%d", il);
106
+
107
+ map_layer_ids[il] = layers.size();
108
+ layers.push_back({ il, k, v });
109
109
  }
110
110
 
111
111
  // allocate tensors and initialize the buffers to avoid NaNs in the padding
@@ -117,8 +117,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
117
117
  if (!buf) {
118
118
  throw std::runtime_error("failed to allocate buffer for kv cache");
119
119
  }
120
- ggml_backend_buffer_clear(buf, 0);
120
+
121
121
  LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
122
+
123
+ ggml_backend_buffer_clear(buf, 0);
122
124
  bufs.emplace_back(buf);
123
125
  }
124
126
 
@@ -126,20 +128,17 @@ llama_kv_cache_unified::llama_kv_cache_unified(
126
128
  const size_t memory_size_k = size_k_bytes();
127
129
  const size_t memory_size_v = size_v_bytes();
128
130
 
129
- LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
130
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
131
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
132
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
131
133
  ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
132
134
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
133
135
  }
134
136
  }
135
137
 
136
138
  void llama_kv_cache_unified::clear() {
137
- for (int32_t i = 0; i < (int32_t) size; ++i) {
138
- cells[i].pos = -1;
139
- cells[i].seq_id.clear();
140
- }
139
+ cells.reset();
140
+
141
141
  head = 0;
142
- used = 0;
143
142
 
144
143
  for (auto & buf : bufs) {
145
144
  ggml_backend_buffer_clear(buf.get(), 0);
@@ -147,7 +146,7 @@ void llama_kv_cache_unified::clear() {
147
146
  }
148
147
 
149
148
  bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
150
- uint32_t new_head = size;
149
+ uint32_t new_head = cells.size();
151
150
 
152
151
  if (p0 < 0) {
153
152
  p0 = 0;
@@ -157,32 +156,20 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
157
156
  p1 = std::numeric_limits<llama_pos>::max();
158
157
  }
159
158
 
160
- for (uint32_t i = 0; i < size; ++i) {
161
- if (cells[i].pos >= p0 && cells[i].pos < p1) {
162
- if (seq_id < 0) {
163
- cells[i].seq_id.clear();
164
- } else if (cells[i].has_seq_id(seq_id)) {
165
- cells[i].seq_id.erase(seq_id);
166
- } else {
167
- continue;
168
- }
169
- if (cells[i].is_empty()) {
170
- // keep count of the number of used cells
171
- if (cells[i].pos >= 0) {
172
- used--;
173
- }
174
-
175
- cells[i].pos = -1;
159
+ for (uint32_t i = 0; i < cells.size(); ++i) {
160
+ if (!cells.pos_in(i, p0, p1)) {
161
+ continue;
162
+ }
176
163
 
177
- if (new_head == size) {
178
- new_head = i;
179
- }
164
+ if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
165
+ if (new_head == cells.size()) {
166
+ new_head = i;
180
167
  }
181
168
  }
182
169
  }
183
170
 
184
171
  // If we freed up a slot, set head to it so searching can start there.
185
- if (new_head != size && new_head < head) {
172
+ if (new_head != cells.size() && new_head < head) {
186
173
  head = new_head;
187
174
  }
188
175
 
@@ -202,49 +189,40 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
202
189
  p1 = std::numeric_limits<llama_pos>::max();
203
190
  }
204
191
 
205
- // otherwise, this is the KV of a Transformer-like model
206
- head = 0;
192
+ for (uint32_t i = 0; i < cells.size(); ++i) {
193
+ if (!cells.pos_in(i, p0, p1)) {
194
+ continue;
195
+ }
207
196
 
208
- for (uint32_t i = 0; i < size; ++i) {
209
- if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
210
- cells[i].seq_id.insert(seq_id_dst);
197
+ if (cells.seq_has(i, seq_id_src)) {
198
+ cells.seq_add(i, seq_id_dst);
211
199
  }
212
200
  }
213
201
  }
214
202
 
215
203
  void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
216
- uint32_t new_head = size;
204
+ uint32_t new_head = cells.size();
217
205
 
218
- for (uint32_t i = 0; i < size; ++i) {
219
- if (!cells[i].has_seq_id(seq_id)) {
220
- if (cells[i].pos >= 0) {
221
- used--;
222
- }
223
-
224
- cells[i].pos = -1;
225
- cells[i].seq_id.clear();
226
-
227
- if (new_head == size){
206
+ for (uint32_t i = 0; i < cells.size(); ++i) {
207
+ if (cells.seq_keep(i, seq_id)) {
208
+ if (new_head == cells.size()) {
228
209
  new_head = i;
229
210
  }
230
- } else {
231
- cells[i].seq_id.clear();
232
- cells[i].seq_id.insert(seq_id);
233
211
  }
234
212
  }
235
213
 
236
214
  // If we freed up a slot, set head to it so searching can start there.
237
- if (new_head != size && new_head < head) {
215
+ if (new_head != cells.size() && new_head < head) {
238
216
  head = new_head;
239
217
  }
240
218
  }
241
219
 
242
- void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
243
- if (delta == 0) {
220
+ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
221
+ if (shift == 0) {
244
222
  return;
245
223
  }
246
224
 
247
- uint32_t new_head = size;
225
+ uint32_t new_head = cells.size();
248
226
 
249
227
  if (p0 < 0) {
250
228
  p0 = 0;
@@ -254,24 +232,19 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
254
232
  p1 = std::numeric_limits<llama_pos>::max();
255
233
  }
256
234
 
257
- // If there is no range then return early to avoid looping over the
235
+ // If there is no range then return early to avoid looping over all cells.
258
236
  if (p0 == p1) {
259
237
  return;
260
238
  }
261
239
 
262
- for (uint32_t i = 0; i < size; ++i) {
263
- if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
264
- has_shift = true;
265
- cells[i].pos += delta;
266
- cells[i].delta += delta;
240
+ for (uint32_t i = 0; i < cells.size(); ++i) {
241
+ if (!cells.pos_in(i, p0, p1)) {
242
+ continue;
243
+ }
267
244
 
268
- if (cells[i].pos < 0) {
269
- if (!cells[i].is_empty()) {
270
- used--;
271
- }
272
- cells[i].pos = -1;
273
- cells[i].seq_id.clear();
274
- if (new_head == size) {
245
+ if (cells.seq_has(i, seq_id)) {
246
+ if (cells.pos_add(i, shift)) {
247
+ if (new_head == cells.size()) {
275
248
  new_head = i;
276
249
  }
277
250
  }
@@ -280,7 +253,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
280
253
 
281
254
  // If we freed up a slot, set head to it so searching can start there.
282
255
  // Otherwise we just start the next search from the beginning.
283
- head = new_head != size ? new_head : 0;
256
+ head = new_head != cells.size() ? new_head : 0;
284
257
  }
285
258
 
286
259
  void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -301,66 +274,41 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
301
274
  return;
302
275
  }
303
276
 
304
- for (uint32_t i = 0; i < size; ++i) {
305
- if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
306
- has_shift = true;
277
+ for (uint32_t i = 0; i < cells.size(); ++i) {
278
+ if (!cells.pos_in(i, p0, p1)) {
279
+ continue;
280
+ }
307
281
 
308
- {
309
- llama_pos p_old = cells[i].pos;
310
- cells[i].pos /= d;
311
- cells[i].delta += cells[i].pos - p_old;
312
- }
282
+ if (cells.seq_has(i, seq_id)) {
283
+ cells.pos_div(i, d);
313
284
  }
314
285
  }
315
286
  }
316
287
 
317
- llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
318
- llama_pos result = 0;
319
-
320
- for (uint32_t i = 0; i < size; ++i) {
321
- if (cells[i].has_seq_id(seq_id)) {
322
- result = std::max(result, cells[i].pos);
323
- }
324
- }
288
+ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
289
+ return cells.seq_pos_min(seq_id);
290
+ }
325
291
 
326
- return result;
292
+ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
293
+ return cells.seq_pos_max(seq_id);
327
294
  }
328
295
 
329
296
  void llama_kv_cache_unified::restore() {
330
- if (pending.ranges.empty()) {
331
- return;
332
- }
333
-
334
- uint32_t new_head = size;
335
-
336
- for (auto & range : pending.ranges) {
337
- for (uint32_t i = range.c0; i < range.c1; ++i) {
338
- cells[i].seq_id.clear();
339
-
340
- // keep count of the number of used cells
341
- if (cells[i].pos >= 0) {
342
- used--;
343
- }
344
-
345
- cells[i].pos = -1;
346
- }
347
-
348
- new_head = std::min(new_head, range.c0);
297
+ for (auto & state : recovery.states) {
298
+ cells.set(state.i, state.cells);
349
299
  }
350
300
 
351
- if (new_head != size && new_head < head) {
352
- head = new_head;
353
- }
301
+ recovery.clear();
354
302
  }
355
303
 
356
304
  void llama_kv_cache_unified::commit() {
357
- if (pending.ranges.empty()) {
358
- LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
359
- __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
305
+ if (recovery.states.empty()) {
306
+ LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
307
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
360
308
  return;
361
309
  }
362
310
 
363
- pending.ranges.clear();
311
+ recovery.clear();
364
312
  }
365
313
 
366
314
  bool llama_kv_cache_unified::update(llama_context & lctx) {
@@ -368,7 +316,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
368
316
 
369
317
  auto * sched = lctx.get_sched();
370
318
 
371
- if (has_shift) {
319
+ if (cells.get_has_shift()) {
372
320
  if (!get_can_shift()) {
373
321
  GGML_ABORT("The current KV cache / model configuration does not support K-shift");
374
322
  }
@@ -392,13 +340,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
392
340
  need_reserve = true;
393
341
  }
394
342
 
395
- {
396
- has_shift = false;
397
-
398
- for (uint32_t i = 0; i < size; ++i) {
399
- cells[i].delta = 0;
400
- }
401
- }
343
+ cells.reset_shift();
402
344
  }
403
345
 
404
346
  if (do_defrag) {
@@ -429,7 +371,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
429
371
  void llama_kv_cache_unified::defrag_sched(float thold) {
430
372
  // - do not defrag small contexts (i.e. < 2048 tokens)
431
373
  // - count the padding towards the number of used tokens
432
- const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
374
+ const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n)) : 0.0f;
433
375
 
434
376
  // queue defragmentation for next llama_kv_cache_update
435
377
  if (fragmentation > thold) {
@@ -440,54 +382,77 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
440
382
  }
441
383
 
442
384
  void llama_kv_cache_unified::set_full() {
443
- n = size;
385
+ n = cells.size();
386
+
387
+ // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
388
+ // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
389
+ // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
390
+ // setting it to 0 is the simplest way to achieve that
391
+ // ref: https://github.com/ggml-org/llama.cpp/issues/13359
392
+ head = 0;
444
393
  }
445
394
 
446
- llama_sbatch llama_kv_cache_unified::sbatch_init(
447
- const llama_batch & batch,
448
- bool logits_all) {
395
+ llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
449
396
  return llama_sbatch(batch, hparams.n_embd, true, logits_all);
450
397
  }
451
398
 
452
- llama_ubatch llama_kv_cache_unified::ubatch_next(
453
- llama_sbatch & sbatch,
454
- uint32_t n_ubatch,
455
- bool embd_pooled) const {
399
+ llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
456
400
  GGML_UNUSED(embd_pooled);
457
401
  return sbatch.split_simple(n_ubatch);
458
402
  }
459
403
 
460
- bool llama_kv_cache_unified::find_slot(
461
- const llama_ubatch & ubatch) {
404
+ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
462
405
  const uint32_t n_tokens = ubatch.n_tokens;
463
- const uint32_t n_seqs = ubatch.n_seqs;
464
- const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
465
406
 
466
407
  // if we have enough unused cells before the current head ->
467
408
  // better to start searching from the beginning of the cache, hoping to fill it
468
- if (head > used + 2*ubatch.n_tokens) {
409
+ if (head > cells.get_used() + 2*ubatch.n_tokens) {
469
410
  head = 0;
470
411
  }
471
412
 
472
413
  // otherwise, one cell per token.
473
414
 
474
- if (n_tokens > size) {
475
- LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size);
415
+ if (n_tokens > cells.size()) {
416
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
476
417
  return false;
477
418
  }
478
419
 
420
+ //#define FIND_SLOT_DEBUG 1
421
+ #if FIND_SLOT_DEBUG
422
+ LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
423
+
424
+ // for debugging
425
+ {
426
+ std::string ss;
427
+ if (n_swa > 0) {
428
+ for (uint32_t i = 0; i < size; ++i) {
429
+ if (cells.is_empty(i)) {
430
+ ss += '.';
431
+ } else {
432
+ ss += 'x';
433
+ }
434
+ if (i%256 == 255) {
435
+ ss += '\n';
436
+ }
437
+ }
438
+ }
439
+ LLAMA_LOG_WARN("\n%s\n", ss.c_str());
440
+ }
441
+ #endif
442
+
479
443
  uint32_t n_tested = 0;
480
444
 
481
445
  while (true) {
482
- if (head + n_tokens > size) {
483
- n_tested += size - head;
446
+ if (head + n_tokens > cells.size()) {
447
+ n_tested += cells.size() - head;
484
448
  head = 0;
485
449
  continue;
486
450
  }
487
451
 
488
452
  bool found = true;
489
453
  for (uint32_t i = 0; i < n_tokens; i++) {
490
- if (cells[head + i].pos >= 0) {
454
+ // TODO: improve to accept cells that are masked by the SWA
455
+ if (!cells.is_empty(head + i)) {
491
456
  found = false;
492
457
  head += i + 1;
493
458
  n_tested += i + 1;
@@ -499,66 +464,257 @@ bool llama_kv_cache_unified::find_slot(
499
464
  break;
500
465
  }
501
466
 
502
- if (n_tested >= size) {
467
+ if (n_tested >= cells.size()) {
503
468
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
504
469
  return false;
505
470
  }
506
471
  }
507
472
 
508
- for (uint32_t s = 0; s < n_seqs; s++) {
509
- for (uint32_t i = 0; i < n_seq_tokens; ++i) {
510
- uint32_t k = s*n_seq_tokens + i;
511
- cells[head + k].pos = ubatch.pos[k];
473
+ // store the old state of the cells in the recovery stack
474
+ recovery.states.push_back({head, cells.cp(head, n_tokens)});
512
475
 
513
- for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
514
- cells[head + k].seq_id.insert(ubatch.seq_id[s][j]);
515
- }
476
+ for (uint32_t i = 0; i < n_tokens; ++i) {
477
+ cells.pos_set(head + i, ubatch.pos[i]);
478
+
479
+ for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
480
+ cells.seq_add(head + i, ubatch.seq_id[i][j]);
516
481
  }
517
482
  }
518
483
 
519
- used += n_tokens;
520
-
521
- pending.ranges.push_back({head, head + n_tokens});
522
-
523
484
  // a heuristic, to avoid attending the full cache if it is not yet utilized
524
485
  // after enough generations, the benefit from this heuristic disappears
525
486
  // if we start defragmenting the cache, the benefit from this will be more important
526
- n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
487
+ n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
527
488
 
528
- //printf("n = %5d, used = %5d, head = %5d\n", n, used, head);
489
+ #ifdef FIND_SLOT_DEBUG
490
+ LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
491
+ #endif
529
492
 
530
493
  return true;
531
494
  }
532
495
 
533
- int32_t llama_kv_cache_unified::get_n_tokens() const {
534
- int32_t result = 0;
496
+ bool llama_kv_cache_unified::get_can_shift() const {
497
+ return true;
498
+ }
499
+
500
+ uint32_t llama_kv_cache_unified::get_n() const {
501
+ return n;
502
+ }
503
+
504
+ uint32_t llama_kv_cache_unified::get_size() const {
505
+ return cells.size();
506
+ }
507
+
508
+ ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
509
+ const int32_t ikv = map_layer_ids.at(il);
510
+
511
+ auto * k = layers[ikv].k;
512
+
513
+ return ggml_view_3d(ctx, k,
514
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n,
515
+ ggml_row_size(k->type, hparams.n_embd_head_k),
516
+ ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
517
+ 0);
518
+ }
519
+
520
+ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const {
521
+ const int32_t ikv = map_layer_ids.at(il);
522
+
523
+ auto * v = layers[ikv].v;
535
524
 
536
- for (uint32_t i = 0; i < size; i++) {
537
- result += cells[i].seq_id.size();
525
+ if (!v_trans) {
526
+ // note: v->nb[1] <= v->nb[2]
527
+ return ggml_view_3d(ctx, v,
528
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n,
529
+ ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
530
+ ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
531
+ 0);
538
532
  }
539
533
 
540
- return result;
534
+ // note: v->nb[1] > v->nb[2]
535
+ return ggml_view_3d(ctx, v,
536
+ n, hparams.n_head_kv(il), hparams.n_embd_head_v,
537
+ ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
538
+ ggml_row_size(v->type, v->ne[1]), // v->nb[2]
539
+ 0);
541
540
  }
542
541
 
543
- int32_t llama_kv_cache_unified::get_used_cells() const {
544
- return used;
542
+ ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
543
+ const int32_t ikv = map_layer_ids.at(il);
544
+
545
+ auto * k = layers[ikv].k;
546
+
547
+ const int64_t n_tokens = k_cur->ne[2];
548
+
549
+ ggml_tensor * k_view = ggml_view_1d(ctx, k,
550
+ n_tokens*hparams.n_embd_k_gqa(il),
551
+ ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head);
552
+
553
+ return ggml_cpy(ctx, k_cur, k_view);
545
554
  }
546
555
 
547
- bool llama_kv_cache_unified::get_can_shift() const {
548
- return can_shift;
556
+ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
557
+ const int32_t ikv = map_layer_ids.at(il);
558
+
559
+ auto * v = layers[ikv].v;
560
+
561
+ const int64_t n_tokens = v_cur->ne[2];
562
+
563
+ v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
564
+
565
+ ggml_tensor * v_view = nullptr;
566
+
567
+ if (!v_trans) {
568
+ v_view = ggml_view_1d(ctx, v,
569
+ n_tokens*hparams.n_embd_v_gqa(il),
570
+ ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head);
571
+ } else {
572
+ // note: the V cache is transposed when not using flash attention
573
+ v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
574
+ (v->ne[1])*ggml_element_size(v),
575
+ ( head)*ggml_element_size(v));
576
+
577
+ v_cur = ggml_transpose(ctx, v_cur);
578
+ }
579
+
580
+ return ggml_cpy(ctx, v_cur, v_view);
549
581
  }
550
582
 
551
- llama_pos llama_kv_cache_unified::get_pos_max() const {
552
- llama_pos pos_max = -1;
553
- for (const auto & cell : cells) {
554
- pos_max = std::max(pos_max, cell.pos);
583
+ void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
584
+ // no pruning is needed when the cache does not use SWA
585
+ GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
586
+
587
+ int n_attended = 0;
588
+
589
+ for (uint32_t i = 0; i < cells.size(); ++i) {
590
+ if (!cells.seq_has(i, seq_id)) {
591
+ continue;
592
+ }
593
+
594
+ const llama_pos p0 = cells.pos_get(i);
595
+
596
+ if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
597
+ n_attended++;
598
+ }
599
+
600
+ if (is_masked_swa(p0, pmax)) {
601
+ cells.seq_rm(i, seq_id);
602
+ }
555
603
  }
556
604
 
557
- return pos_max;
605
+ if (n_attended < std::min<int>(n_swa, pmin)) {
606
+ LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
607
+ }
608
+ }
609
+
610
+ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
611
+ const int64_t n_tokens = ubatch->n_tokens;
612
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
613
+ const int64_t n_seqs = ubatch->n_seqs;
614
+
615
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
616
+ float * data = (float *) dst->data;
617
+
618
+ const int64_t n_kv = n;
619
+
620
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
621
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
622
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
623
+ // Causal mask:
624
+ // xxx-------
625
+ // xxxx------
626
+ // xxxxx-----
627
+ // Non-causal mask:
628
+ // xxxxx-----
629
+ // xxxxx-----
630
+ // xxxxx-----
631
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
632
+ for (int h = 0; h < 1; ++h) {
633
+ for (int s = 0; s < n_seqs; ++s) {
634
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
635
+
636
+ for (int j = 0; j < n_seq_tokens; ++j) {
637
+ const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
638
+
639
+ for (int i = 0; i < n_kv; ++i) {
640
+ float f = 0.0f;
641
+
642
+ bool masked = false;
643
+
644
+ if (cells.is_empty(i)) {
645
+ masked = true;
646
+ } else {
647
+ const llama_pos p0 = cells.pos_get(i);
648
+
649
+ // mask the token if not the same sequence
650
+ masked = masked || (!cells.seq_has(i, seq_id));
651
+
652
+ // mask future tokens
653
+ masked = masked || (causal_attn && p0 > p1);
654
+
655
+ // apply SWA if any
656
+ masked = masked || (is_masked_swa(p0, p1));
657
+
658
+ if (!masked && hparams.use_alibi) {
659
+ f = -std::abs(p0 - p1);
660
+ }
661
+ }
662
+
663
+ if (masked) {
664
+ f = -INFINITY;
665
+ }
666
+
667
+ data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
668
+ }
669
+ }
670
+ }
671
+
672
+ // mask padded tokens
673
+ if (data) {
674
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
675
+ for (int j = 0; j < n_kv; ++j) {
676
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
677
+ }
678
+ }
679
+ }
680
+ }
681
+ }
682
+
683
+ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
684
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
685
+
686
+ int32_t * data = (int32_t *) dst->data;
687
+
688
+ for (uint32_t i = 0; i < cells.size(); ++i) {
689
+ data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
690
+ }
691
+ }
692
+
693
+ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
694
+ const int64_t n_tokens = ubatch->n_tokens;
695
+
696
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
697
+ GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
698
+
699
+ int32_t * data = (int32_t *) dst->data;
700
+
701
+ const int64_t n_kv = n;
702
+
703
+ for (int h = 0; h < 1; ++h) {
704
+ for (int j = 0; j < n_tokens; ++j) {
705
+ for (int i = 0; i < n_kv; ++i) {
706
+ // the position when the cells is empty is irrelevant - it will be masked out later in the attention
707
+ const llama_pos p0 = cells.is_empty(i) ? -1 : cells.pos_get(i);
708
+
709
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(p0, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
710
+ }
711
+ }
712
+ }
558
713
  }
559
714
 
560
715
  size_t llama_kv_cache_unified::total_size() const {
561
716
  size_t size = 0;
717
+
562
718
  for (const auto & buf : bufs) {
563
719
  size += ggml_backend_buffer_get_size(buf.get());
564
720
  }
@@ -569,8 +725,8 @@ size_t llama_kv_cache_unified::total_size() const {
569
725
  size_t llama_kv_cache_unified::size_k_bytes() const {
570
726
  size_t size_k_bytes = 0;
571
727
 
572
- for (const auto & k : k_l) {
573
- size_k_bytes += ggml_nbytes(k);
728
+ for (const auto & layer : layers) {
729
+ size_k_bytes += ggml_nbytes(layer.k);
574
730
  }
575
731
 
576
732
  return size_k_bytes;
@@ -579,8 +735,8 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
579
735
  size_t llama_kv_cache_unified::size_v_bytes() const {
580
736
  size_t size_v_bytes = 0;
581
737
 
582
- for (const auto & v : v_l) {
583
- size_v_bytes += ggml_nbytes(v);
738
+ for (const auto & layer : layers) {
739
+ size_v_bytes += ggml_nbytes(layer.v);
584
740
  }
585
741
 
586
742
  return size_v_bytes;
@@ -601,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
601
757
  const auto & yarn_beta_slow = cparams.yarn_beta_slow;
602
758
 
603
759
  const auto & n_rot = hparams.n_rot;
604
- const auto & rope_type = hparams.rope_type;
760
+ const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
761
+ // @ngxson : this is a workaround
762
+ // for M-RoPE, we want to rotate the whole vector when doing KV shift
763
+ // a normal RoPE should work, we just need to use the correct ordering
764
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13870
765
+ ? LLAMA_ROPE_TYPE_NEOX
766
+ : hparams.rope_type;
605
767
 
606
768
  // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
607
769
  // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
608
- const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
770
+ const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
771
+ ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
772
+ : cparams.yarn_attn_factor;
609
773
 
610
774
  ggml_tensor * tmp;
611
775
 
@@ -644,13 +808,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
644
808
  GGML_UNUSED(ubatch);
645
809
 
646
810
  if (k_shift) {
647
- assert(ggml_backend_buffer_is_host(k_shift->buffer));
648
-
649
- int32_t * data = (int32_t *) k_shift->data;
650
-
651
- for (uint32_t i = 0; i < kv_self->size; ++i) {
652
- data[i] = kv_self->cells[i].delta;
653
- }
811
+ kv_self->set_input_k_shift(k_shift);
654
812
  }
655
813
  }
656
814
 
@@ -660,13 +818,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
660
818
  ggml_cgraph * gf) const {
661
819
  auto res = std::make_unique<llm_graph_result>();
662
820
 
663
- const auto & n_layer = hparams.n_layer;
664
-
665
821
  const auto & n_embd_head_k = hparams.n_embd_head_k;
666
822
  //const auto & n_embd_head_v = hparams.n_embd_head_v;
667
823
 
668
- const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
669
-
670
824
  //GGML_ASSERT(kv_self->size == n_ctx);
671
825
 
672
826
  auto inp = std::make_unique<llm_graph_input_k_shift>(this);
@@ -674,24 +828,22 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
674
828
  inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
675
829
  ggml_set_input(inp->k_shift);
676
830
 
677
- for (uint32_t il = 0; il < n_layer; ++il) {
831
+ for (const auto & layer : layers) {
832
+ const uint32_t il = layer.il;
833
+
678
834
  const int64_t n_head_kv = hparams.n_head_kv(il);
679
835
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
680
836
 
681
- const bool is_swa = hparams.is_swa(il);
682
-
683
- // note: the swa rope params could become part of the cparams in the future
684
- // if we decide to make them configurable, like the non-sliding ones
685
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
686
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
837
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
838
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
687
839
 
688
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
840
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
689
841
 
690
842
  ggml_tensor * k =
691
- ggml_view_3d(ctx, k_l[il],
692
- n_embd_head_k, n_head_kv, size,
693
- ggml_row_size(k_l[il]->type, n_embd_head_k),
694
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
843
+ ggml_view_3d(ctx, layer.k,
844
+ n_embd_head_k, n_head_kv, cells.size(),
845
+ ggml_row_size(layer.k->type, n_embd_head_k),
846
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
695
847
  0);
696
848
 
697
849
  ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
@@ -796,44 +948,46 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
796
948
  nm++;
797
949
  }
798
950
 
799
- for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
951
+ for (const auto & layer : layers) {
952
+ const uint32_t il = layer.il;
953
+
800
954
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
801
955
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
802
956
 
803
- ggml_tensor * view_k_src = ggml_view_2d(ctx, k_l[il],
957
+ ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
804
958
  n_embd_k_gqa, nm,
805
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
806
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
959
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
960
+ ggml_row_size(layer.k->type, n_embd_k_gqa*i));
807
961
 
808
- ggml_tensor * view_k_dst = ggml_view_2d(ctx, k_l[il],
962
+ ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
809
963
  n_embd_k_gqa, nm,
810
- ggml_row_size(k_l[il]->type, n_embd_k_gqa),
811
- ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
964
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
965
+ ggml_row_size(layer.k->type, n_embd_k_gqa*id));
812
966
 
813
967
  ggml_tensor * view_v_src;
814
968
  ggml_tensor * view_v_dst;
815
969
 
816
970
  if (cparams.flash_attn) {
817
971
  // NOTE: the V cache is not transposed when using flash attention
818
- view_v_src = ggml_view_2d(ctx, v_l[il],
972
+ view_v_src = ggml_view_2d(ctx, layer.v,
819
973
  n_embd_v_gqa, nm,
820
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
821
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
974
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
975
+ ggml_row_size(layer.v->type, n_embd_v_gqa*i));
822
976
 
823
- view_v_dst = ggml_view_2d(ctx, v_l[il],
977
+ view_v_dst = ggml_view_2d(ctx, layer.v,
824
978
  n_embd_v_gqa, nm,
825
- ggml_row_size(v_l[il]->type, n_embd_v_gqa),
826
- ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
979
+ ggml_row_size(layer.v->type, n_embd_v_gqa),
980
+ ggml_row_size(layer.v->type, n_embd_v_gqa*id));
827
981
  } else {
828
- view_v_src = ggml_view_2d(ctx, v_l[il],
982
+ view_v_src = ggml_view_2d(ctx, layer.v,
829
983
  nm, n_embd_v_gqa,
830
- ggml_row_size(v_l[il]->type, size),
831
- ggml_row_size(v_l[il]->type, i));
984
+ ggml_row_size(layer.v->type, cells.size()),
985
+ ggml_row_size(layer.v->type, i));
832
986
 
833
- view_v_dst = ggml_view_2d(ctx, v_l[il],
987
+ view_v_dst = ggml_view_2d(ctx, layer.v,
834
988
  nm, n_embd_v_gqa,
835
- ggml_row_size(v_l[il]->type, size),
836
- ggml_row_size(v_l[il]->type, id));
989
+ ggml_row_size(layer.v->type, cells.size()),
990
+ ggml_row_size(layer.v->type, id));
837
991
  }
838
992
 
839
993
  ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
@@ -850,10 +1004,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
850
1004
  }
851
1005
 
852
1006
  bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
853
- const uint32_t n_layer = hparams.n_layer;
1007
+ const uint32_t n_layer = layers.size();
854
1008
 
855
- const uint32_t n_kv = cell_max();
856
- const uint32_t n_used = used;
1009
+ const uint32_t n_kv = cells.used_max_p1();
1010
+ const uint32_t n_used = cells.get_used();
857
1011
 
858
1012
  assert(n_used <= n_kv);
859
1013
 
@@ -881,9 +1035,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
881
1035
  ids.resize(n_kv, n_kv);
882
1036
 
883
1037
  for (uint32_t i0 = 0; i0 < n_used; ++i0) {
884
- const auto & cell0 = cells[i0];
885
-
886
- if (!cell0.is_empty()) {
1038
+ if (!cells.is_empty(i0)) {
887
1039
  ids[i0] = i0;
888
1040
 
889
1041
  continue;
@@ -894,7 +1046,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
894
1046
  uint32_t nh = 1;
895
1047
 
896
1048
  // determine the size of the hole
897
- while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
1049
+ while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
898
1050
  nh++;
899
1051
  }
900
1052
 
@@ -903,9 +1055,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
903
1055
 
904
1056
  // starting from the end, find nh non-empty cells
905
1057
  for (; is > i0; --is) {
906
- const auto & cell1 = cells[is];
907
-
908
- if (cell1.is_empty() || ids[is] != n_kv) {
1058
+ if (cells.is_empty(is) || ids[is] != n_kv) {
909
1059
  continue;
910
1060
  }
911
1061
 
@@ -932,9 +1082,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
932
1082
 
933
1083
  // go back and move the nf cells to the hole
934
1084
  for (; i1 < n_kv; ++i1) {
935
- auto & cell1 = cells[i1];
936
-
937
- if (cell1.is_empty() || ids[i1] != n_kv) {
1085
+ if (cells.is_empty(i1) || ids[i1] != n_kv) {
938
1086
  if (n_moves == max_moves) {
939
1087
  stop = true;
940
1088
  break;
@@ -948,10 +1096,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
948
1096
  ids[i1] = i0 + nf;
949
1097
 
950
1098
  // move the cell meta data
951
- cells[i0 + nf] = cell1;
1099
+ cells.mv(i1, i0 + nf);
952
1100
 
953
- // clear the old cell and move the head there
954
- cell1 = kv_cell();
955
1101
  head = n_used;
956
1102
 
957
1103
  if (!cont) {
@@ -986,16 +1132,30 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
986
1132
  return true;
987
1133
  }
988
1134
 
989
- uint32_t llama_kv_cache_unified::cell_max() const {
990
- for (uint32_t i = size; i > 0; --i) {
991
- const kv_cell & cell = cells[i - 1];
1135
+ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
1136
+ assert(p0 >= 0 && p1 >= 0);
992
1137
 
993
- if (cell.pos >= 0 && !cell.is_empty()) {
994
- return i;
995
- }
1138
+ switch (swa_type) {
1139
+ case LLAMA_SWA_TYPE_NONE:
1140
+ {
1141
+ } break;
1142
+ case LLAMA_SWA_TYPE_STANDARD:
1143
+ {
1144
+ if (p1 - p0 >= (int32_t) n_swa) {
1145
+ return true;
1146
+ }
1147
+ } break;
1148
+ case LLAMA_SWA_TYPE_CHUNKED:
1149
+ {
1150
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
1151
+
1152
+ if (p0 < pos_chunk_start) {
1153
+ return true;
1154
+ }
1155
+ } break;
996
1156
  }
997
1157
 
998
- return 0;
1158
+ return false;
999
1159
  }
1000
1160
 
1001
1161
  void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
@@ -1004,23 +1164,24 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
1004
1164
 
1005
1165
  // Count the number of cells with the specified seq_id
1006
1166
  // Find all the ranges of cells with this seq id (or all, when -1)
1007
- uint32_t cell_range_begin = size;
1008
- for (uint32_t i = 0; i < size; ++i) {
1009
- const auto & cell = cells[i];
1010
- if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
1167
+ uint32_t cell_range_begin = cells.size();
1168
+
1169
+ for (uint32_t i = 0; i < cells.size(); ++i) {
1170
+ if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
1011
1171
  ++cell_count;
1012
- if (cell_range_begin == size) {
1172
+ if (cell_range_begin == cells.size()) {
1013
1173
  cell_range_begin = i;
1014
1174
  }
1015
1175
  } else {
1016
- if (cell_range_begin != size) {
1176
+ if (cell_range_begin != cells.size()) {
1017
1177
  cell_ranges.emplace_back(cell_range_begin, i);
1018
- cell_range_begin = size;
1178
+ cell_range_begin = cells.size();
1019
1179
  }
1020
1180
  }
1021
1181
  }
1022
- if (cell_range_begin != size) {
1023
- cell_ranges.emplace_back(cell_range_begin, size);
1182
+
1183
+ if (cell_range_begin != cells.size()) {
1184
+ cell_ranges.emplace_back(cell_range_begin, cells.size());
1024
1185
  }
1025
1186
 
1026
1187
  // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -1057,17 +1218,24 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
1057
1218
  void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
1058
1219
  for (const auto & range : cell_ranges) {
1059
1220
  for (uint32_t i = range.first; i < range.second; ++i) {
1060
- const auto & cell = cells[i];
1061
- const llama_pos pos = cell.pos;
1062
- const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
1221
+ std::vector<llama_seq_id> seq_ids;
1222
+
1223
+ for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
1224
+ if (cur == seq_id || seq_id == -1) {
1225
+ if (cells.seq_has(i, cur)) {
1226
+ seq_ids.push_back(cur);
1227
+ }
1228
+ }
1229
+ }
1230
+
1231
+ const llama_pos pos = cells.pos_get(i);
1232
+ const uint32_t n_seq_id = seq_ids.size();
1063
1233
 
1064
1234
  io.write(&pos, sizeof(pos));
1065
1235
  io.write(&n_seq_id, sizeof(n_seq_id));
1066
1236
 
1067
- if (n_seq_id) {
1068
- for (auto seq_id : cell.seq_id) {
1069
- io.write(&seq_id, sizeof(seq_id));
1070
- }
1237
+ for (const auto & seq_id : seq_ids) {
1238
+ io.write(&seq_id, sizeof(seq_id));
1071
1239
  }
1072
1240
  }
1073
1241
  }
@@ -1075,7 +1243,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::
1075
1243
 
1076
1244
  void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
1077
1245
  const uint32_t v_trans = this->v_trans ? 1 : 0;
1078
- const uint32_t n_layer = hparams.n_layer;
1246
+ const uint32_t n_layer = layers.size();
1079
1247
 
1080
1248
  io.write(&v_trans, sizeof(v_trans));
1081
1249
  io.write(&n_layer, sizeof(n_layer));
@@ -1084,56 +1252,63 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
1084
1252
 
1085
1253
  // Iterate and write all the keys first, each row is a cell
1086
1254
  // Get whole range at a time
1087
- for (uint32_t il = 0; il < n_layer; ++il) {
1255
+ for (const auto & layer : layers) {
1256
+ const uint32_t il = layer.il;
1257
+
1088
1258
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1089
1259
 
1090
1260
  // Write key type
1091
- const int32_t k_type_i = (int32_t)k_l[il]->type;
1261
+ const int32_t k_type_i = (int32_t)layer.k->type;
1092
1262
  io.write(&k_type_i, sizeof(k_type_i));
1093
1263
 
1094
1264
  // Write row size of key
1095
- const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1265
+ const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
1096
1266
  io.write(&k_size_row, sizeof(k_size_row));
1097
1267
 
1098
1268
  // Read each range of cells of k_size length each into tmp_buf and write out
1099
1269
  for (const auto & range : cell_ranges) {
1100
1270
  const size_t range_size = range.second - range.first;
1101
1271
  const size_t buf_size = range_size * k_size_row;
1102
- io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
1272
+ io.write_tensor(layer.k, range.first * k_size_row, buf_size);
1103
1273
  }
1104
1274
  }
1105
1275
 
1106
1276
  if (!v_trans) {
1107
- for (uint32_t il = 0; il < n_layer; ++il) {
1277
+ for (const auto & layer : layers) {
1278
+ const uint32_t il = layer.il;
1279
+
1108
1280
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1109
1281
 
1110
1282
  // Write value type
1111
- const int32_t v_type_i = (int32_t)v_l[il]->type;
1283
+ const int32_t v_type_i = (int32_t)layer.v->type;
1112
1284
  io.write(&v_type_i, sizeof(v_type_i));
1113
1285
 
1114
1286
  // Write row size of value
1115
- const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1287
+ const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
1116
1288
  io.write(&v_size_row, sizeof(v_size_row));
1117
1289
 
1118
1290
  // Read each range of cells of v_size length each into tmp_buf and write out
1119
1291
  for (const auto & range : cell_ranges) {
1120
1292
  const size_t range_size = range.second - range.first;
1121
1293
  const size_t buf_size = range_size * v_size_row;
1122
- io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
1294
+ io.write_tensor(layer.v, range.first * v_size_row, buf_size);
1123
1295
  }
1124
1296
  }
1125
1297
  } else {
1126
1298
  // When v is transposed, we also need the element size and get the element ranges from each row
1127
- const uint32_t kv_size = size;
1128
- for (uint32_t il = 0; il < n_layer; ++il) {
1299
+ const uint32_t kv_size = cells.size();
1300
+
1301
+ for (const auto & layer : layers) {
1302
+ const uint32_t il = layer.il;
1303
+
1129
1304
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1130
1305
 
1131
1306
  // Write value type
1132
- const int32_t v_type_i = (int32_t)v_l[il]->type;
1307
+ const int32_t v_type_i = (int32_t)layer.v->type;
1133
1308
  io.write(&v_type_i, sizeof(v_type_i));
1134
1309
 
1135
1310
  // Write element size
1136
- const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
1311
+ const uint32_t v_size_el = ggml_type_size(layer.v->type);
1137
1312
  io.write(&v_size_el, sizeof(v_size_el));
1138
1313
 
1139
1314
  // Write GQA embedding size
@@ -1146,7 +1321,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
1146
1321
  const size_t range_size = range.second - range.first;
1147
1322
  const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1148
1323
  const size_t buf_size = range_size * v_size_el;
1149
- io.write_tensor(v_l[il], src_offset, buf_size);
1324
+ io.write_tensor(layer.v, src_offset, buf_size);
1150
1325
  }
1151
1326
  }
1152
1327
  }
@@ -1163,8 +1338,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
1163
1338
  llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
1164
1339
 
1165
1340
  batch.n_tokens = cell_count;
1166
- batch.n_seq_tokens = cell_count;
1167
- batch.n_seqs = 1;
1168
1341
 
1169
1342
  for (uint32_t i = 0; i < cell_count; ++i) {
1170
1343
  llama_pos pos;
@@ -1173,32 +1346,40 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
1173
1346
  io.read_to(&pos, sizeof(pos));
1174
1347
  io.read_to(&n_seq_id, sizeof(n_seq_id));
1175
1348
 
1176
- if (n_seq_id != 0) {
1349
+ if (n_seq_id != 1) {
1177
1350
  LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1178
1351
  return false;
1179
1352
  }
1180
1353
 
1181
- batch.pos[i] = pos;
1354
+ // read the sequence id, but directly discard it - we will use dest_seq_id instead
1355
+ {
1356
+ llama_seq_id seq_id;
1357
+ io.read_to(&seq_id, sizeof(seq_id));
1358
+ }
1359
+
1360
+ batch.pos[i] = pos;
1361
+ batch.n_seq_id[i] = n_seq_id;
1362
+ batch.seq_id[i] = &dest_seq_id;
1182
1363
  }
1183
- batch.n_seq_id[0] = 1;
1184
- batch.seq_id[0] = &dest_seq_id;
1364
+
1185
1365
  if (!find_slot(batch)) {
1186
1366
  LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1187
1367
  return false;
1188
1368
  }
1369
+
1189
1370
  commit();
1190
1371
 
1191
1372
  // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
1192
1373
  // Assume that this is one contiguous block of cells
1193
- GGML_ASSERT(head + cell_count <= size);
1194
- GGML_ASSERT(cells[head].pos == batch.pos[0]);
1195
- GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
1196
- GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
1197
- GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
1374
+ GGML_ASSERT(head + cell_count <= cells.size());
1375
+ GGML_ASSERT(cells.pos_get(head) == batch.pos[0]);
1376
+ GGML_ASSERT(cells.pos_get(head + cell_count - 1) == batch.pos[cell_count - 1]);
1377
+ GGML_ASSERT(cells.seq_has(head, dest_seq_id));
1378
+ GGML_ASSERT(cells.seq_has(head + cell_count - 1, dest_seq_id));
1198
1379
  } else {
1199
1380
  // whole KV cache restore
1200
1381
 
1201
- if (cell_count > size) {
1382
+ if (cell_count > cells.size()) {
1202
1383
  LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1203
1384
  return false;
1204
1385
  }
@@ -1206,34 +1387,28 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
1206
1387
  clear();
1207
1388
 
1208
1389
  for (uint32_t i = 0; i < cell_count; ++i) {
1209
- kv_cell & cell = cells[i];
1210
-
1211
1390
  llama_pos pos;
1212
1391
  uint32_t n_seq_id;
1213
1392
 
1214
1393
  io.read_to(&pos, sizeof(pos));
1215
1394
  io.read_to(&n_seq_id, sizeof(n_seq_id));
1216
1395
 
1217
- cell.pos = pos;
1396
+ cells.pos_set(i, pos);
1218
1397
 
1219
1398
  for (uint32_t j = 0; j < n_seq_id; ++j) {
1220
1399
  llama_seq_id seq_id;
1221
1400
  io.read_to(&seq_id, sizeof(seq_id));
1222
1401
 
1223
- // TODO: llama_kv_cache_unified should have a notion of max sequences
1224
- //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
1225
- if (seq_id < 0) {
1226
- //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
1227
- LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
1402
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
1403
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
1228
1404
  return false;
1229
1405
  }
1230
1406
 
1231
- cell.seq_id.insert(seq_id);
1407
+ cells.seq_add(i, seq_id);
1232
1408
  }
1233
1409
  }
1234
1410
 
1235
1411
  head = 0;
1236
- used = cell_count;
1237
1412
  }
1238
1413
 
1239
1414
  return true;
@@ -1242,15 +1417,16 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
1242
1417
  bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
1243
1418
  uint32_t v_trans;
1244
1419
  uint32_t n_layer;
1420
+
1245
1421
  io.read_to(&v_trans, sizeof(v_trans));
1246
1422
  io.read_to(&n_layer, sizeof(n_layer));
1247
1423
 
1248
- if (n_layer != hparams.n_layer) {
1249
- LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
1424
+ if (n_layer != layers.size()) {
1425
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
1250
1426
  return false;
1251
1427
  }
1252
- if (cell_count > size) {
1253
- LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
1428
+ if (cell_count > cells.size()) {
1429
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
1254
1430
  return false;
1255
1431
  }
1256
1432
  if (this->v_trans != (bool) v_trans) {
@@ -1259,13 +1435,15 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1259
1435
  }
1260
1436
 
1261
1437
  // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1262
- for (uint32_t il = 0; il < n_layer; ++il) {
1438
+ for (const auto & layer : layers) {
1439
+ const uint32_t il = layer.il;
1440
+
1263
1441
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
1264
1442
 
1265
1443
  // Read type of key
1266
1444
  int32_t k_type_i_ref;
1267
1445
  io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1268
- const int32_t k_type_i = (int32_t) k_l[il]->type;
1446
+ const int32_t k_type_i = (int32_t) layer.k->type;
1269
1447
  if (k_type_i != k_type_i_ref) {
1270
1448
  LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1271
1449
  return false;
@@ -1274,7 +1452,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1274
1452
  // Read row size of key
1275
1453
  uint64_t k_size_row_ref;
1276
1454
  io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1277
- const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
1455
+ const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
1278
1456
  if (k_size_row != k_size_row_ref) {
1279
1457
  LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1280
1458
  return false;
@@ -1282,18 +1460,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1282
1460
 
1283
1461
  if (cell_count) {
1284
1462
  // Read and set the keys for the whole cell range
1285
- ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1463
+ ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
1286
1464
  }
1287
1465
  }
1288
1466
 
1289
1467
  if (!this->v_trans) {
1290
- for (uint32_t il = 0; il < n_layer; ++il) {
1468
+ for (const auto & layer : layers) {
1469
+ const uint32_t il = layer.il;
1470
+
1291
1471
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1292
1472
 
1293
1473
  // Read type of value
1294
1474
  int32_t v_type_i_ref;
1295
1475
  io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1296
- const int32_t v_type_i = (int32_t)v_l[il]->type;
1476
+ const int32_t v_type_i = (int32_t)layer.v->type;
1297
1477
  if (v_type_i != v_type_i_ref) {
1298
1478
  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1299
1479
  return false;
@@ -1302,7 +1482,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1302
1482
  // Read row size of value
1303
1483
  uint64_t v_size_row_ref;
1304
1484
  io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1305
- const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
1485
+ const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
1306
1486
  if (v_size_row != v_size_row_ref) {
1307
1487
  LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1308
1488
  return false;
@@ -1310,18 +1490,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1310
1490
 
1311
1491
  if (cell_count) {
1312
1492
  // Read and set the values for the whole cell range
1313
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1493
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
1314
1494
  }
1315
1495
  }
1316
1496
  } else {
1317
1497
  // For each layer, read the values for each cell (transposed)
1318
- for (uint32_t il = 0; il < n_layer; ++il) {
1498
+ for (const auto & layer : layers) {
1499
+ const uint32_t il = layer.il;
1500
+
1319
1501
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
1320
1502
 
1321
1503
  // Read type of value
1322
1504
  int32_t v_type_i_ref;
1323
1505
  io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1324
- const int32_t v_type_i = (int32_t)v_l[il]->type;
1506
+ const int32_t v_type_i = (int32_t)layer.v->type;
1325
1507
  if (v_type_i != v_type_i_ref) {
1326
1508
  LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1327
1509
  return false;
@@ -1330,7 +1512,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1330
1512
  // Read element size of value
1331
1513
  uint32_t v_size_el_ref;
1332
1514
  io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1333
- const size_t v_size_el = ggml_type_size(v_l[il]->type);
1515
+ const size_t v_size_el = ggml_type_size(layer.v->type);
1334
1516
  if (v_size_el != v_size_el_ref) {
1335
1517
  LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1336
1518
  return false;
@@ -1347,8 +1529,8 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1347
1529
  if (cell_count) {
1348
1530
  // For each row in the transposed matrix, read the values for the whole cell range
1349
1531
  for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1350
- const size_t dst_offset = (head + j * size) * v_size_el;
1351
- ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1532
+ const size_t dst_offset = (head + j * cells.size()) * v_size_el;
1533
+ ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1352
1534
  }
1353
1535
  }
1354
1536
  }
@@ -1357,6 +1539,193 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
1357
1539
  return true;
1358
1540
  }
1359
1541
 
1542
+ //
1543
+ // llama_kv_cache_unified_iswa
1544
+ //
1545
+
1546
+ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
1547
+ const llama_model & model,
1548
+ ggml_type type_k,
1549
+ ggml_type type_v,
1550
+ bool v_trans,
1551
+ bool offload,
1552
+ bool swa_full,
1553
+ uint32_t kv_size,
1554
+ uint32_t n_seq_max,
1555
+ uint32_t n_batch,
1556
+ uint32_t n_pad) : hparams(model.hparams) {
1557
+ llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
1558
+ llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
1559
+
1560
+ const uint32_t size_base = kv_size;
1561
+
1562
+ uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
1563
+
1564
+ // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
1565
+ if (swa_full) {
1566
+ LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
1567
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
1568
+
1569
+ size_swa = size_base;
1570
+ do_prune = false;
1571
+ }
1572
+
1573
+ LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
1574
+
1575
+ kv_base = std::make_unique<llama_kv_cache_unified>(
1576
+ model, std::move(filter_base), type_k, type_v,
1577
+ v_trans, offload, size_base, n_seq_max, n_pad,
1578
+ 0, LLAMA_SWA_TYPE_NONE);
1579
+
1580
+ LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
1581
+
1582
+ kv_swa = std::make_unique<llama_kv_cache_unified>(
1583
+ model, std::move(filter_swa), type_k, type_v,
1584
+ v_trans, offload, size_swa, n_seq_max, n_pad,
1585
+ hparams.n_swa, hparams.swa_type);
1586
+ }
1587
+
1588
+ void llama_kv_cache_unified_iswa::clear() {
1589
+ kv_base->clear();
1590
+ kv_swa ->clear();
1591
+ }
1592
+
1593
+ bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
1594
+ bool res = true;
1595
+
1596
+ res = res & kv_base->seq_rm(seq_id, p0, p1);
1597
+ res = res & kv_swa ->seq_rm(seq_id, p0, p1);
1598
+
1599
+ return res;
1600
+ }
1601
+
1602
+ void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
1603
+ kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
1604
+ kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
1605
+ }
1606
+
1607
+ void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
1608
+ kv_base->seq_keep(seq_id);
1609
+ kv_swa ->seq_keep(seq_id);
1610
+ }
1611
+
1612
+ void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
1613
+ kv_base->seq_add(seq_id, p0, p1, shift);
1614
+ kv_swa ->seq_add(seq_id, p0, p1, shift);
1615
+ }
1616
+
1617
+ void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
1618
+ kv_base->seq_div(seq_id, p0, p1, d);
1619
+ kv_swa ->seq_div(seq_id, p0, p1, d);
1620
+ }
1621
+
1622
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
1623
+ // the base cache is a superset of the SWA cache, so we can just check the SWA cache
1624
+ return kv_swa->seq_pos_min(seq_id);
1625
+ }
1626
+
1627
+ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
1628
+ return kv_swa->seq_pos_max(seq_id);
1629
+ }
1630
+
1631
+ void llama_kv_cache_unified_iswa::restore() {
1632
+ kv_base->restore();
1633
+ kv_swa ->restore();
1634
+ }
1635
+
1636
+ void llama_kv_cache_unified_iswa::commit() {
1637
+ kv_base->commit();
1638
+ kv_swa ->commit();
1639
+
1640
+ // slide the attention window, forgetting/pruning old tokens that are outside the window
1641
+ if (do_prune) {
1642
+ for (const auto & [seq_id, entry] : pending.pos) {
1643
+ kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
1644
+ }
1645
+
1646
+ }
1647
+
1648
+ pending.clear();
1649
+ }
1650
+
1651
+ bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
1652
+ bool res = true;
1653
+
1654
+ res = res & kv_base->update(lctx);
1655
+ res = res & kv_swa ->update(lctx);
1656
+
1657
+ return res;
1658
+ }
1659
+
1660
+ void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
1661
+ kv_base->defrag_sched(thold);
1662
+ kv_swa ->defrag_sched(thold);
1663
+ }
1664
+
1665
+ void llama_kv_cache_unified_iswa::set_full() {
1666
+ kv_base->set_full();
1667
+ kv_swa ->set_full();
1668
+ }
1669
+
1670
+ llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
1671
+ pending.clear();
1672
+
1673
+ if (do_prune) {
1674
+ for (int i = 0; i < batch.n_tokens; ++i) {
1675
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
1676
+ const llama_seq_id seq_id = batch.seq_id[i][s];
1677
+ const llama_pos pos = batch.pos[i];
1678
+
1679
+ if (pending.pos.find(seq_id) == pending.pos.end()) {
1680
+ pending.pos[seq_id].pmin = pos;
1681
+ pending.pos[seq_id].pmax = pos;
1682
+ } else {
1683
+ pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
1684
+ pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
1685
+ }
1686
+ }
1687
+ }
1688
+ }
1689
+
1690
+ return llama_sbatch(batch, hparams.n_embd, true, logits_all);
1691
+ }
1692
+
1693
+ llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
1694
+ GGML_UNUSED(embd_pooled);
1695
+ return sbatch.split_simple(n_ubatch);
1696
+ }
1697
+
1698
+ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
1699
+ bool res = true;
1700
+
1701
+ res = res & kv_base->find_slot(batch);
1702
+ res = res & kv_swa ->find_slot(batch);
1703
+
1704
+ return res;
1705
+ }
1706
+
1707
+ bool llama_kv_cache_unified_iswa::get_can_shift() const {
1708
+ return kv_base->get_size() == kv_swa->get_size();
1709
+ }
1710
+
1711
+ void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
1712
+ kv_base->state_write(io, seq_id);
1713
+ kv_swa ->state_write(io, seq_id);
1714
+ }
1715
+
1716
+ void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
1717
+ kv_base->state_read(io, seq_id);
1718
+ kv_swa ->state_read(io, seq_id);
1719
+ }
1720
+
1721
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
1722
+ return kv_base.get();
1723
+ }
1724
+
1725
+ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
1726
+ return kv_swa.get();
1727
+ }
1728
+
1360
1729
  //
1361
1730
  // llama_kv_cache_recurrent
1362
1731
  //
@@ -1366,19 +1735,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
1366
1735
  ggml_type type_k,
1367
1736
  ggml_type type_v,
1368
1737
  bool offload,
1369
- uint32_t kv_size) : hparams(model.hparams) {
1738
+ uint32_t kv_size,
1739
+ uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
1370
1740
  const int32_t n_layer = hparams.n_layer;
1371
1741
 
1372
- LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
1373
- __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
1742
+ LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
1743
+ __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
1374
1744
 
1375
1745
  head = 0;
1376
1746
  size = kv_size;
1377
1747
  used = 0;
1378
1748
 
1379
- this->type_k = type_k;
1380
- this->type_v = type_v;
1381
-
1382
1749
  cells.clear();
1383
1750
  cells.resize(kv_size);
1384
1751
 
@@ -1616,8 +1983,8 @@ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
1616
1983
  }
1617
1984
  }
1618
1985
 
1619
- void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
1620
- if (delta == 0) {
1986
+ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
1987
+ if (shift == 0) {
1621
1988
  return;
1622
1989
  }
1623
1990
 
@@ -1640,7 +2007,7 @@ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_
1640
2007
  if (tail_id >= 0) {
1641
2008
  kv_cell & cell = cells[tail_id];
1642
2009
  if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
1643
- cell.pos += delta;
2010
+ cell.pos += shift;
1644
2011
  }
1645
2012
  }
1646
2013
  }
@@ -1676,8 +2043,24 @@ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_
1676
2043
  }
1677
2044
  }
1678
2045
 
2046
+ llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
2047
+ llama_pos result = std::numeric_limits<llama_pos>::max();
2048
+
2049
+ for (uint32_t i = 0; i < size; ++i) {
2050
+ if (cells[i].has_seq_id(seq_id)) {
2051
+ result = std::min(result, cells[i].pos);
2052
+ }
2053
+ }
2054
+
2055
+ if (result == std::numeric_limits<llama_pos>::max()) {
2056
+ result = -1;
2057
+ }
2058
+
2059
+ return result;
2060
+ }
2061
+
1679
2062
  llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
1680
- llama_pos result = 0;
2063
+ llama_pos result = -1;
1681
2064
 
1682
2065
  for (uint32_t i = 0; i < size; ++i) {
1683
2066
  if (cells[i].has_seq_id(seq_id)) {
@@ -1700,8 +2083,8 @@ void llama_kv_cache_recurrent::commit() {
1700
2083
  pending.ranges.clear();
1701
2084
  }
1702
2085
 
1703
- bool llama_kv_cache_recurrent::update(llama_context & lctx) {
1704
- GGML_UNUSED(lctx);
2086
+ bool llama_kv_cache_recurrent::update(llama_context & ctx) {
2087
+ GGML_UNUSED(ctx);
1705
2088
  return false;
1706
2089
  }
1707
2090
 
@@ -1712,6 +2095,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
1712
2095
 
1713
2096
  void llama_kv_cache_recurrent::set_full() {
1714
2097
  n = size;
2098
+ head = 0;
1715
2099
  }
1716
2100
 
1717
2101
  llama_sbatch llama_kv_cache_recurrent::sbatch_init(
@@ -1761,7 +2145,7 @@ bool llama_kv_cache_recurrent::find_slot(
1761
2145
  if (seq_id < 0 || (uint32_t) seq_id >= size) {
1762
2146
  // too big seq_id
1763
2147
  // TODO: would it be possible to resize the cache instead?
1764
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
2148
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
1765
2149
  return false;
1766
2150
  }
1767
2151
  if (j > 0) {
@@ -1904,29 +2288,6 @@ bool llama_kv_cache_recurrent::find_slot(
1904
2288
  return n >= n_seqs;
1905
2289
  }
1906
2290
 
1907
- int32_t llama_kv_cache_recurrent::get_n_tokens() const {
1908
- int32_t result = 0;
1909
-
1910
- for (uint32_t i = 0; i < size; i++) {
1911
- result += cells[i].seq_id.size();
1912
- }
1913
-
1914
- return result;
1915
- }
1916
-
1917
- int32_t llama_kv_cache_recurrent::get_used_cells() const {
1918
- return used;
1919
- }
1920
-
1921
- llama_pos llama_kv_cache_recurrent::get_pos_max() const {
1922
- llama_pos pos_max = -1;
1923
- for (const auto & cell : cells) {
1924
- pos_max = std::max(pos_max, cell.pos);
1925
- }
1926
-
1927
- return pos_max;
1928
- }
1929
-
1930
2291
  bool llama_kv_cache_recurrent::get_can_shift() const {
1931
2292
  return false;
1932
2293
  }
@@ -2055,6 +2416,7 @@ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq
2055
2416
  io.read_to(&cell_count, sizeof(cell_count));
2056
2417
 
2057
2418
  bool res = true;
2419
+
2058
2420
  res = res && state_read_meta(io, cell_count, seq_id);
2059
2421
  res = res && state_read_data(io, cell_count);
2060
2422
 
@@ -2383,104 +2745,3 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
2383
2745
 
2384
2746
  return true;
2385
2747
  }
2386
-
2387
- //
2388
- // kv cache view
2389
- //
2390
-
2391
- llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
2392
- llama_kv_cache_view result = {
2393
- /*.n_cells = */ 0,
2394
- /*.n_seq_max = */ n_seq_max,
2395
- /*.token_count = */ 0,
2396
- /*.used_cells = */ kv.get_used_cells(),
2397
- /*.max_contiguous = */ 0,
2398
- /*.max_contiguous_idx = */ -1,
2399
- /*.cells = */ nullptr,
2400
- /*.cells_sequences = */ nullptr,
2401
- };
2402
-
2403
- return result;
2404
- }
2405
-
2406
- void llama_kv_cache_view_free(llama_kv_cache_view * view) {
2407
- if (view->cells != nullptr) {
2408
- free(view->cells);
2409
- view->cells = nullptr;
2410
- }
2411
- if (view->cells_sequences != nullptr) {
2412
- free(view->cells_sequences);
2413
- view->cells_sequences = nullptr;
2414
- }
2415
- }
2416
-
2417
- void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
2418
- // TODO: rework this in the future, for now quick hack
2419
- const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
2420
- if (kvu == nullptr) {
2421
- LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
2422
- return;
2423
- }
2424
-
2425
- if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
2426
- view->n_cells = int32_t(kvu->size);
2427
- void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
2428
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
2429
- view->cells = (llama_kv_cache_view_cell *)p;
2430
- p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
2431
- GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
2432
- view->cells_sequences = (llama_seq_id *)p;
2433
- }
2434
-
2435
- const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
2436
- llama_kv_cache_view_cell * c_curr = view->cells;
2437
- llama_seq_id * cs_curr = view->cells_sequences;
2438
- int32_t used_cells = 0;
2439
- int32_t token_count = 0;
2440
- int32_t curr_contig_idx = -1;
2441
- uint32_t max_contig = 0;
2442
- int32_t max_contig_idx = -1;
2443
-
2444
- for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
2445
- const size_t curr_size = kv_cells[i].seq_id.size();
2446
- token_count += curr_size;
2447
- c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
2448
-
2449
- if (curr_size > 0) {
2450
- if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
2451
- max_contig = i - curr_contig_idx;
2452
- max_contig_idx = curr_contig_idx;
2453
- }
2454
- curr_contig_idx = -1;
2455
- } else if (curr_contig_idx < 0) {
2456
- curr_contig_idx = i;
2457
- }
2458
-
2459
- int seq_idx = 0;
2460
- for (const llama_seq_id it : kv_cells[i].seq_id) {
2461
- if (seq_idx >= view->n_seq_max) {
2462
- break;
2463
- }
2464
- cs_curr[seq_idx] = it;
2465
- seq_idx++;
2466
- }
2467
- if (seq_idx != 0) {
2468
- used_cells++;
2469
- }
2470
- for (; seq_idx < view->n_seq_max; seq_idx++) {
2471
- cs_curr[seq_idx] = -1;
2472
- }
2473
- }
2474
- if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
2475
- max_contig_idx = curr_contig_idx;
2476
- max_contig = kv_cells.size() - curr_contig_idx;
2477
- }
2478
- view->max_contiguous = max_contig;
2479
- view->max_contiguous_idx = max_contig_idx;
2480
- view->token_count = token_count;
2481
- view->used_cells = used_cells;
2482
- if (uint32_t(used_cells) != kvu->used) {
2483
- LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
2484
- __func__, kvu->used, used_cells);
2485
- }
2486
- }