whispercpp 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (963)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -43
  3. data/ext/extconf.rb +2 -2
  4. data/ext/ruby_whisper.c +14 -2
  5. data/ext/ruby_whisper.h +39 -0
  6. data/ext/ruby_whisper_context.c +22 -22
  7. data/ext/ruby_whisper_model.c +12 -12
  8. data/ext/ruby_whisper_params.c +79 -25
  9. data/ext/ruby_whisper_segment.c +84 -19
  10. data/ext/ruby_whisper_token.c +351 -0
  11. data/ext/ruby_whisper_transcribe.cpp +1 -1
  12. data/ext/ruby_whisper_vad_context.c +75 -0
  13. data/ext/ruby_whisper_vad_context_detect.cpp +50 -0
  14. data/ext/ruby_whisper_vad_segment.c +139 -0
  15. data/ext/ruby_whisper_vad_segments.c +106 -0
  16. data/ext/sources/CMakeLists.txt +4 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/cmake/arm64-apple-clang.cmake +16 -0
  19. data/ext/sources/cmake/arm64-windows-llvm.cmake +16 -0
  20. data/ext/sources/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  21. data/ext/sources/cmake/x64-windows-llvm.cmake +5 -0
  22. data/ext/sources/examples/CMakeLists.txt +1 -0
  23. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  24. data/ext/sources/examples/addon.node/index.js +7 -5
  25. data/ext/sources/examples/addon.node/vad-example.js +2 -2
  26. data/ext/sources/examples/bench/bench.cpp +26 -16
  27. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  28. data/ext/sources/examples/cli/cli.cpp +122 -111
  29. data/ext/sources/examples/command/command.cpp +26 -24
  30. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  31. data/ext/sources/examples/common-ggml.cpp +2 -0
  32. data/ext/sources/examples/lsp/CMakeLists.txt +2 -1
  33. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  34. data/ext/sources/examples/quantize/CMakeLists.txt +2 -1
  35. data/ext/sources/examples/server/server.cpp +34 -24
  36. data/ext/sources/examples/server.py +6 -1
  37. data/ext/sources/examples/stream/stream.cpp +4 -2
  38. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  39. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  40. data/ext/sources/examples/talk-llama/CMakeLists.txt +7 -3
  41. data/ext/sources/examples/talk-llama/llama-adapter.cpp +113 -7
  42. data/ext/sources/examples/talk-llama/llama-adapter.h +13 -1
  43. data/ext/sources/examples/talk-llama/llama-arch.cpp +2136 -1491
  44. data/ext/sources/examples/talk-llama/llama-arch.h +125 -3
  45. data/ext/sources/examples/talk-llama/llama-batch.cpp +174 -100
  46. data/ext/sources/examples/talk-llama/llama-batch.h +46 -20
  47. data/ext/sources/examples/talk-llama/llama-chat.cpp +199 -8
  48. data/ext/sources/examples/talk-llama/llama-chat.h +11 -0
  49. data/ext/sources/examples/talk-llama/llama-context.cpp +1213 -413
  50. data/ext/sources/examples/talk-llama/llama-context.h +99 -36
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -4
  52. data/ext/sources/examples/talk-llama/llama-grammar.cpp +288 -53
  53. data/ext/sources/examples/talk-llama/llama-grammar.h +22 -1
  54. data/ext/sources/examples/talk-llama/llama-graph.cpp +883 -294
  55. data/ext/sources/examples/talk-llama/llama-graph.h +361 -161
  56. data/ext/sources/examples/talk-llama/llama-hparams.cpp +144 -6
  57. data/ext/sources/examples/talk-llama/llama-hparams.h +100 -23
  58. data/ext/sources/examples/talk-llama/llama-impl.cpp +7 -3
  59. data/ext/sources/examples/talk-llama/llama-impl.h +3 -1
  60. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +328 -0
  61. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  62. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2100 -0
  63. data/ext/sources/examples/talk-llama/llama-kv-cache.h +373 -27
  64. data/ext/sources/examples/talk-llama/llama-kv-cells.h +124 -30
  65. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +63 -41
  66. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  67. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +77 -35
  68. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +15 -16
  69. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  70. data/ext/sources/examples/talk-llama/llama-mmap.cpp +172 -37
  71. data/ext/sources/examples/talk-llama/llama-mmap.h +8 -3
  72. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +93 -9
  73. data/ext/sources/examples/talk-llama/llama-model-loader.h +9 -2
  74. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +3 -0
  75. data/ext/sources/examples/talk-llama/llama-model.cpp +3369 -10145
  76. data/ext/sources/examples/talk-llama/llama-model.h +104 -12
  77. data/ext/sources/examples/talk-llama/llama-quant.cpp +53 -30
  78. data/ext/sources/examples/talk-llama/llama-sampling.cpp +1520 -324
  79. data/ext/sources/examples/talk-llama/llama-sampling.h +19 -7
  80. data/ext/sources/examples/talk-llama/llama-vocab.cpp +562 -39
  81. data/ext/sources/examples/talk-llama/llama-vocab.h +50 -0
  82. data/ext/sources/examples/talk-llama/llama.cpp +794 -12
  83. data/ext/sources/examples/talk-llama/llama.h +246 -190
  84. data/ext/sources/examples/talk-llama/models/afmoe.cpp +191 -0
  85. data/ext/sources/examples/talk-llama/models/apertus.cpp +125 -0
  86. data/ext/sources/examples/talk-llama/models/arcee.cpp +135 -0
  87. data/ext/sources/examples/talk-llama/models/arctic.cpp +138 -0
  88. data/ext/sources/examples/talk-llama/models/arwkv7.cpp +86 -0
  89. data/ext/sources/examples/talk-llama/models/baichuan.cpp +122 -0
  90. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +144 -0
  91. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +135 -0
  92. data/ext/sources/examples/talk-llama/models/bert.cpp +178 -0
  93. data/ext/sources/examples/talk-llama/models/bitnet.cpp +160 -0
  94. data/ext/sources/examples/talk-llama/models/bloom.cpp +101 -0
  95. data/ext/sources/examples/talk-llama/models/chameleon.cpp +178 -0
  96. data/ext/sources/examples/talk-llama/models/chatglm.cpp +132 -0
  97. data/ext/sources/examples/talk-llama/models/codeshell.cpp +111 -0
  98. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +102 -0
  99. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +134 -0
  100. data/ext/sources/examples/talk-llama/models/command-r.cpp +122 -0
  101. data/ext/sources/examples/talk-llama/models/dbrx.cpp +123 -0
  102. data/ext/sources/examples/talk-llama/models/deci.cpp +135 -0
  103. data/ext/sources/examples/talk-llama/models/deepseek.cpp +144 -0
  104. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +259 -0
  105. data/ext/sources/examples/talk-llama/models/dots1.cpp +134 -0
  106. data/ext/sources/examples/talk-llama/models/dream.cpp +105 -0
  107. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +150 -0
  108. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +110 -0
  109. data/ext/sources/examples/talk-llama/models/exaone.cpp +114 -0
  110. data/ext/sources/examples/talk-llama/models/exaone4.cpp +123 -0
  111. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +113 -0
  112. data/ext/sources/examples/talk-llama/models/falcon.cpp +120 -0
  113. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +116 -0
  114. data/ext/sources/examples/talk-llama/models/gemma.cpp +112 -0
  115. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +128 -0
  116. data/ext/sources/examples/talk-llama/models/gemma3.cpp +155 -0
  117. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +384 -0
  118. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +170 -0
  119. data/ext/sources/examples/talk-llama/models/glm4.cpp +150 -0
  120. data/ext/sources/examples/talk-llama/models/gpt2.cpp +105 -0
  121. data/ext/sources/examples/talk-llama/models/gptneox.cpp +144 -0
  122. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +196 -0
  123. data/ext/sources/examples/talk-llama/models/granite.cpp +211 -0
  124. data/ext/sources/examples/talk-llama/models/graph-context-mamba.cpp +283 -0
  125. data/ext/sources/examples/talk-llama/models/grok.cpp +159 -0
  126. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +141 -0
  127. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +132 -0
  128. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +154 -0
  129. data/ext/sources/examples/talk-llama/models/internlm2.cpp +120 -0
  130. data/ext/sources/examples/talk-llama/models/jais.cpp +86 -0
  131. data/ext/sources/examples/talk-llama/models/jamba.cpp +106 -0
  132. data/ext/sources/examples/talk-llama/models/lfm2.cpp +175 -0
  133. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +122 -0
  134. data/ext/sources/examples/talk-llama/models/llada.cpp +99 -0
  135. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +178 -0
  136. data/ext/sources/examples/talk-llama/models/llama.cpp +168 -0
  137. data/ext/sources/examples/talk-llama/models/maincoder.cpp +117 -0
  138. data/ext/sources/examples/talk-llama/models/mamba.cpp +55 -0
  139. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +123 -0
  140. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +199 -0
  141. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +124 -0
  142. data/ext/sources/examples/talk-llama/models/mistral3.cpp +160 -0
  143. data/ext/sources/examples/talk-llama/models/models.h +569 -0
  144. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +116 -0
  145. data/ext/sources/examples/talk-llama/models/mpt.cpp +126 -0
  146. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +150 -0
  147. data/ext/sources/examples/talk-llama/models/nemotron.cpp +122 -0
  148. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +104 -0
  149. data/ext/sources/examples/talk-llama/models/olmo.cpp +121 -0
  150. data/ext/sources/examples/talk-llama/models/olmo2.cpp +150 -0
  151. data/ext/sources/examples/talk-llama/models/olmoe.cpp +124 -0
  152. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +127 -0
  153. data/ext/sources/examples/talk-llama/models/openelm.cpp +124 -0
  154. data/ext/sources/examples/talk-llama/models/orion.cpp +123 -0
  155. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +121 -0
  156. data/ext/sources/examples/talk-llama/models/phi2.cpp +121 -0
  157. data/ext/sources/examples/talk-llama/models/phi3.cpp +152 -0
  158. data/ext/sources/examples/talk-llama/models/plamo.cpp +110 -0
  159. data/ext/sources/examples/talk-llama/models/plamo2.cpp +316 -0
  160. data/ext/sources/examples/talk-llama/models/plamo3.cpp +128 -0
  161. data/ext/sources/examples/talk-llama/models/plm.cpp +168 -0
  162. data/ext/sources/examples/talk-llama/models/qwen.cpp +108 -0
  163. data/ext/sources/examples/talk-llama/models/qwen2.cpp +126 -0
  164. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +151 -0
  165. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +117 -0
  166. data/ext/sources/examples/talk-llama/models/qwen3.cpp +117 -0
  167. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +124 -0
  168. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +873 -0
  169. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +149 -0
  170. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +141 -0
  171. data/ext/sources/examples/talk-llama/models/refact.cpp +94 -0
  172. data/ext/sources/examples/talk-llama/models/rnd1.cpp +126 -0
  173. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +162 -0
  174. data/ext/sources/examples/talk-llama/models/rwkv6.cpp +94 -0
  175. data/ext/sources/examples/talk-llama/models/rwkv6qwen2.cpp +86 -0
  176. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +135 -0
  177. data/ext/sources/examples/talk-llama/models/rwkv7.cpp +90 -0
  178. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +124 -0
  179. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +126 -0
  180. data/ext/sources/examples/talk-llama/models/smollm3.cpp +128 -0
  181. data/ext/sources/examples/talk-llama/models/stablelm.cpp +146 -0
  182. data/ext/sources/examples/talk-llama/models/starcoder.cpp +100 -0
  183. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +121 -0
  184. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +166 -0
  185. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +96 -0
  186. data/ext/sources/examples/talk-llama/models/wavtokenizer-dec.cpp +149 -0
  187. data/ext/sources/examples/talk-llama/models/xverse.cpp +108 -0
  188. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  189. data/ext/sources/examples/talk-llama/unicode.cpp +309 -16
  190. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  191. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +1 -1
  192. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  193. data/ext/sources/examples/whisper.wasm/index-tmpl.html +18 -17
  194. data/ext/sources/ggml/CMakeLists.txt +135 -79
  195. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  196. data/ext/sources/ggml/include/ggml-alloc.h +9 -0
  197. data/ext/sources/ggml/include/ggml-backend.h +21 -2
  198. data/ext/sources/ggml/include/ggml-cpu.h +2 -1
  199. data/ext/sources/ggml/include/ggml-hexagon.h +19 -0
  200. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  201. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  202. data/ext/sources/ggml/include/ggml-rpc.h +8 -11
  203. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  204. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  205. data/ext/sources/ggml/include/ggml-zendnn.h +22 -0
  206. data/ext/sources/ggml/include/ggml.h +406 -23
  207. data/ext/sources/ggml/src/CMakeLists.txt +99 -13
  208. data/ext/sources/ggml/src/ggml-alloc.c +368 -161
  209. data/ext/sources/ggml/src/ggml-backend-impl.h +5 -5
  210. data/ext/sources/ggml/src/ggml-backend-reg.cpp +55 -14
  211. data/ext/sources/ggml/src/ggml-backend.cpp +290 -57
  212. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +17 -3
  213. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +10 -13
  214. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  215. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +59 -45
  216. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +138 -47
  217. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +2586 -1917
  218. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +348 -309
  219. data/ext/sources/ggml/src/ggml-cann/common.h +350 -133
  220. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +894 -625
  221. data/ext/sources/ggml/src/ggml-common.h +17 -0
  222. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +167 -75
  223. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  224. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  225. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +560 -622
  226. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1002 -270
  227. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +107 -587
  228. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  229. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  230. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +373 -486
  231. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  232. data/ext/sources/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  233. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  234. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  235. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  236. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4682 -1660
  237. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +82 -4
  238. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +18 -9
  240. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +263 -111
  241. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +39 -28
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +683 -82
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +38 -43
  244. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +435 -119
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  246. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1234 -1182
  247. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  248. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +2167 -1480
  249. data/ext/sources/ggml/src/ggml-cpu/ops.h +10 -12
  250. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  251. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  252. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1132 -81
  253. data/ext/sources/ggml/src/ggml-cpu/repack.h +36 -0
  254. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +120 -93
  255. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  256. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  257. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  258. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  259. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  260. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  261. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  262. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +7 -0
  263. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +294 -27
  264. data/ext/sources/ggml/src/ggml-cpu/vec.h +606 -48
  265. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +92 -17
  266. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  267. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  268. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +2 -2
  269. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +123 -6
  270. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +16 -0
  271. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  272. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  273. data/ext/sources/ggml/src/ggml-cuda/common.cuh +588 -128
  274. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  275. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  276. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  277. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  278. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +25 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  280. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +335 -485
  281. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +1 -5
  282. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  283. data/ext/sources/ggml/src/ggml-cuda/cumsum.cu +307 -0
  284. data/ext/sources/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  285. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  286. data/ext/sources/ggml/src/ggml-cuda/diag.cu +77 -0
  287. data/ext/sources/ggml/src/ggml-cuda/diag.cuh +5 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +519 -378
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +750 -637
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +1244 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +98 -61
  294. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  295. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +230 -197
  296. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  297. data/ext/sources/ggml/src/ggml-cuda/fill.cu +37 -0
  298. data/ext/sources/ggml/src/ggml-cuda/fill.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  300. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +1557 -294
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  302. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mean.cu +57 -2
  304. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +915 -69
  305. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +171 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +835 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmid.cu +164 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmid.cuh +5 -0
  309. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +109 -67
  310. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1601 -733
  311. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +802 -0
  312. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +12 -0
  313. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +286 -149
  314. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  315. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  316. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  317. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  318. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/pad.cu +86 -32
  320. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  321. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  322. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +163 -10
  323. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +14 -0
  324. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  325. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  326. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/rope.cu +207 -98
  328. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +2 -0
  329. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  330. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +330 -0
  331. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  332. data/ext/sources/ggml/src/ggml-cuda/set.cu +39 -0
  333. data/ext/sources/ggml/src/ggml-cuda/set.cuh +7 -0
  334. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  335. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +325 -61
  337. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  338. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  339. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +14 -12
  340. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +291 -104
  341. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  342. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +96 -0
  407. data/ext/sources/ggml/src/ggml-cuda/top-k.cuh +3 -0
  408. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +351 -0
  409. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +21 -0
  410. data/ext/sources/ggml/src/ggml-cuda/tri.cu +136 -0
  411. data/ext/sources/ggml/src/ggml-cuda/tri.cuh +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  413. data/ext/sources/ggml/src/ggml-cuda/unary.cu +189 -5
  414. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +44 -0
  415. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +248 -6
  416. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  417. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +8 -0
  418. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +70 -37
  419. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +10 -3
  420. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +80 -0
  421. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3151 -0
  422. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +44 -0
  423. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +682 -0
  424. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  425. data/ext/sources/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  426. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +566 -0
  427. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +112 -0
  428. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  429. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.c +63 -0
  430. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-dma.h +157 -0
  431. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +165 -0
  432. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +92 -0
  433. data/ext/sources/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  434. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +94 -0
  435. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +72 -0
  436. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  437. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +1020 -0
  438. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +1353 -0
  439. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +1001 -0
  440. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +2503 -0
  441. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  442. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +487 -0
  443. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +168 -0
  444. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  445. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +287 -0
  446. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  447. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  448. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +454 -0
  449. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +221 -0
  450. data/ext/sources/ggml/src/ggml-hexagon/op-desc.h +153 -0
  451. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +16 -13
  452. data/ext/sources/ggml/src/ggml-impl.h +186 -15
  453. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  454. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  455. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  456. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  457. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +609 -0
  458. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1743 -0
  459. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +273 -0
  460. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1686 -0
  461. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +356 -61
  462. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +4161 -0
  463. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  464. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +724 -0
  465. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +4495 -1876
  466. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +21 -9
  467. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +29 -0
  468. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4005 -427
  469. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  470. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  471. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  472. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  473. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +147 -0
  474. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  475. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +82 -0
  476. data/ext/sources/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  477. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  478. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  479. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  480. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  481. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  482. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  483. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  484. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  485. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  486. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  487. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  488. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +39 -0
  489. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +94 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +33 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  521. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  522. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  523. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  524. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +497 -195
  525. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +48 -3
  526. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +77 -0
  527. data/ext/sources/ggml/src/ggml-sycl/add-id.hpp +8 -0
  528. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +8 -0
  529. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  530. data/ext/sources/ggml/src/ggml-sycl/common.hpp +117 -15
  531. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +50 -30
  532. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  533. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +200 -99
  534. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  535. data/ext/sources/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  536. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -309
  537. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  538. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  539. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  540. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +77 -34
  541. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +397 -314
  542. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +12 -2
  543. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  544. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  545. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +643 -413
  546. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  547. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  548. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  549. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +223 -132
  550. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +230 -55
  551. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +2 -0
  552. data/ext/sources/ggml/src/ggml-sycl/pad.cpp +97 -0
  553. data/ext/sources/ggml/src/ggml-sycl/pad.hpp +24 -0
  554. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  555. data/ext/sources/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  556. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +2 -0
  557. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  558. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  559. data/ext/sources/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  560. data/ext/sources/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  561. data/ext/sources/ggml/src/ggml-sycl/roll.cpp +122 -0
  562. data/ext/sources/ggml/src/ggml-sycl/roll.hpp +20 -0
  563. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +65 -59
  564. data/ext/sources/ggml/src/ggml-sycl/set.cpp +73 -0
  565. data/ext/sources/ggml/src/ggml-sycl/set.hpp +5 -0
  566. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  567. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  568. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +330 -165
  569. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +4 -0
  570. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  571. data/ext/sources/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  572. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  573. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +60 -6
  574. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  575. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  576. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +7398 -2635
  577. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  578. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  579. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +43 -3
  580. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  581. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +15 -6
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +56 -39
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +5 -5
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +67 -13
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +158 -16
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +38 -3
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +4 -4
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +2 -2
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +4 -4
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +4 -4
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +2 -2
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +103 -36
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +220 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +139 -45
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +113 -38
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +75 -14
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +19 -17
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +2 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +7 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +4 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +33 -17
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +2 -2
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +227 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +71 -21
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +41 -25
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +2 -2
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +44 -26
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +2 -2
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +2 -2
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +2 -2
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +20 -14
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +4 -6
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +2 -2
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +4 -6
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +4 -6
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +2 -2
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +494 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -556
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +230 -51
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +566 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +72 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +90 -223
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +195 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +41 -5
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +59 -9
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +104 -14
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +234 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +6 -52
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +6 -35
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +6 -35
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +28 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +6 -39
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.glsl +5 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +3 -3
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +30 -8
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +16 -6
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +43 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  740. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +435 -24
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +148 -6
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  743. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +619 -177
  744. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  745. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  746. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +169 -0
  747. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3087 -0
  748. data/ext/sources/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  749. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  750. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  751. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  752. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  753. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +147 -0
  754. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +591 -0
  755. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  756. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  757. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  758. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  759. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  760. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  761. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  762. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  763. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  764. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  765. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  766. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +112 -0
  767. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  768. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  769. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +483 -0
  770. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  771. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  772. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  773. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  774. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  775. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  776. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  777. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  778. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +466 -0
  779. data/ext/sources/ggml/src/ggml.c +901 -129
  780. data/ext/sources/ggml/src/gguf.cpp +8 -1
  781. data/ext/sources/include/whisper.h +1 -0
  782. data/ext/sources/src/CMakeLists.txt +3 -1
  783. data/ext/sources/src/whisper.cpp +124 -81
  784. data/ext/sources/tests/CMakeLists.txt +8 -1
  785. data/ext/sources/tests/test-vad-full.cpp +7 -5
  786. data/ext/sources/tests/test-vad.cpp +3 -3
  787. data/extsources.rb +1 -0
  788. data/lib/whisper/model/uri.rb +17 -18
  789. data/sig/whisper.rbs +126 -2
  790. data/test/test_params.rb +24 -8
  791. data/test/test_segment.rb +0 -1
  792. data/test/test_token.rb +70 -0
  793. data/test/test_vad.rb +1 -1
  794. data/test/test_vad_context.rb +50 -0
  795. data/test/test_vad_segment.rb +19 -0
  796. data/test/test_vad_segments.rb +16 -0
  797. data/test/test_whisper.rb +8 -1
  798. data/whispercpp.gemspec +1 -1
  799. metadata +439 -179
  800. data/ext/sources/build-xcframework.sh +0 -547
  801. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  802. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  803. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  804. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  805. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  806. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  807. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  808. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  809. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  810. data/ext/sources/ggml/src/ggml-cann/Doxyfile +0 -2579
  811. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  812. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  813. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  814. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  815. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  816. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  817. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  818. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  819. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  820. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  821. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  822. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  823. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  824. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  825. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  826. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  827. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -506
  828. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +0 -11
  829. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  830. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  831. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  832. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  833. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  834. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  835. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  836. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  837. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  838. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  839. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  840. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  841. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  842. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  843. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  844. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  845. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  846. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  847. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  848. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  849. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  850. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  851. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  852. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  853. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  854. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  855. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  856. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  857. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  858. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  859. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  860. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  861. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  862. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  863. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  864. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  865. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  866. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  867. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  868. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  869. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  870. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  871. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  872. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  873. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  874. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  875. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  876. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  877. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  878. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  879. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  880. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  881. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  882. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  883. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  884. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  885. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  886. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  887. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  888. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  889. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  890. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  891. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  892. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  893. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  894. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  895. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  896. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  897. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  898. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  899. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  900. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  901. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  902. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  903. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  904. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  905. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  906. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  907. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  908. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  909. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  910. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  911. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  912. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  913. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  914. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  915. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  916. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  917. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  918. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  919. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  920. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  921. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  922. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  923. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  924. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  925. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  926. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  927. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  928. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  929. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  930. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  931. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  932. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  933. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  934. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  935. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  936. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  937. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  938. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  939. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  940. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  941. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  942. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  943. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  944. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  945. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  946. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  947. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  948. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  949. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  950. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  951. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  952. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  953. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  954. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
  955. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +0 -162
  956. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -118
  957. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -99
  958. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -58
  959. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  960. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  961. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  962. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  963. /data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
@@ -0,0 +1,2100 @@
1
+ #include "llama-kv-cache.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-io.h"
5
+ #include "llama-model.h"
6
+ #include "llama-context.h"
7
+
8
+ #include <algorithm>
9
+ #include <cassert>
10
+ #include <cmath>
11
+ #include <cstring>
12
+ #include <limits>
13
+ #include <map>
14
+ #include <stdexcept>
15
+
16
+ //
17
+ // llama_kv_cache
18
+ //
19
+
20
+ llama_kv_cache::llama_kv_cache(
21
+ const llama_model & model,
22
+ ggml_type type_k,
23
+ ggml_type type_v,
24
+ bool v_trans,
25
+ bool offload,
26
+ bool unified,
27
+ uint32_t kv_size,
28
+ uint32_t n_seq_max,
29
+ uint32_t n_pad,
30
+ uint32_t n_swa,
31
+ llama_swa_type swa_type,
32
+ const layer_filter_cb & filter,
33
+ const layer_reuse_cb & reuse) :
34
+ model(model), hparams(model.hparams), v_trans(v_trans),
35
+ n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
36
+
37
+ GGML_ASSERT(kv_size % n_pad == 0);
38
+
39
+ const uint32_t n_layer_kv = hparams.n_layer_kv();
40
+
41
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
42
+ struct ggml_backend_buft_comparator {
43
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
44
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
45
+ }
46
+ };
47
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
48
+
49
+ // create a context for each buffer type
50
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
51
+ auto it = ctx_map.find(buft);
52
+ if (it == ctx_map.end()) {
53
+ ggml_init_params params = {
54
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
55
+ /*.mem_buffer =*/ NULL,
56
+ /*.no_alloc =*/ true,
57
+ };
58
+
59
+ ggml_context * ctx = ggml_init(params);
60
+ if (!ctx) {
61
+ return nullptr;
62
+ }
63
+
64
+ ctx_map.emplace(buft, ctx);
65
+
66
+ return ctx;
67
+ }
68
+
69
+ return it->second.get();
70
+ };
71
+
72
+ GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
73
+
74
+ v_heads.resize(n_stream);
75
+ for (uint32_t s = 0; s < n_stream; ++s) {
76
+ v_heads[s] = 0;
77
+ }
78
+
79
+ v_cells.resize(n_stream);
80
+ for (uint32_t s = 0; s < n_stream; ++s) {
81
+ v_cells[s].resize(kv_size);
82
+ }
83
+
84
+ // by default, all sequence ids are mapped to the 0th stream
85
+ seq_to_stream.resize(LLAMA_MAX_SEQ, 0);
86
+
87
+ if (n_stream > 1) {
88
+ seq_to_stream.resize(n_stream, 0);
89
+ for (uint32_t s = 0; s < n_stream; ++s) {
90
+ seq_to_stream[s] = s;
91
+ }
92
+ }
93
+
94
+ // [TAG_V_CACHE_VARIABLE]
95
+ if (v_trans && hparams.is_n_embd_v_gqa_variable()) {
96
+ LLAMA_LOG_WARN("%s: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to %d\n",
97
+ __func__, hparams.n_embd_v_gqa_max());
98
+ }
99
+
100
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
101
+ if (!hparams.has_kv(il)) {
102
+ LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
103
+ continue;
104
+ }
105
+
106
+ if (filter && !filter(il)) {
107
+ LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
108
+ continue;
109
+ }
110
+
111
+ // [TAG_V_CACHE_VARIABLE]
112
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
113
+ const uint32_t n_embd_v_gqa = !v_trans ? hparams.n_embd_v_gqa(il) : hparams.n_embd_v_gqa_max();
114
+
115
+ const char * dev_name = "CPU";
116
+
117
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
118
+
119
+ if (offload) {
120
+ auto * dev = model.dev_layer(il);
121
+ buft = ggml_backend_dev_buffer_type(dev);
122
+
123
+ dev_name = ggml_backend_dev_name(dev);
124
+ }
125
+
126
+ LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
127
+
128
+ ggml_context * ctx = ctx_for_buft(buft);
129
+ if (!ctx) {
130
+ throw std::runtime_error("failed to create ggml context for kv cache");
131
+ }
132
+
133
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
134
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
135
+
136
+ ggml_format_name(k, "cache_k_l%d", il);
137
+ ggml_format_name(v, "cache_v_l%d", il);
138
+
139
+ std::vector<ggml_tensor *> k_stream;
140
+ std::vector<ggml_tensor *> v_stream;
141
+
142
+ for (uint32_t s = 0; s < n_stream; ++s) {
143
+ k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
144
+ v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
145
+ }
146
+
147
+ map_layer_ids[il] = layers.size();
148
+
149
+ layers.push_back({ il, k, v, k_stream, v_stream, });
150
+ }
151
+
152
+ if (reuse) {
153
+ LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
154
+
155
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
156
+ const int32_t il_reuse = reuse(il);
157
+
158
+ if (il_reuse < 0) {
159
+ LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
160
+ continue;
161
+ }
162
+
163
+ if (filter && !filter(il)) {
164
+ LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
165
+ continue;
166
+ }
167
+
168
+ GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
169
+
170
+ map_layer_ids[il] = map_layer_ids[il_reuse];
171
+
172
+ LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
173
+ }
174
+ }
175
+
176
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
177
+ for (auto & [buft, ctx] : ctx_map) {
178
+ ggml_backend_buffer_t buf;
179
+ if (model.hparams.no_alloc) {
180
+ buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
181
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
182
+ t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
183
+ }
184
+ } else {
185
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
186
+ }
187
+ if (!buf) {
188
+ throw std::runtime_error("failed to allocate buffer for kv cache");
189
+ }
190
+
191
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
192
+
193
+ ggml_backend_buffer_clear(buf, 0);
194
+ ctxs_bufs.emplace_back(std::move(ctx), buf);
195
+ }
196
+
197
+ {
198
+ const size_t memory_size_k = size_k_bytes();
199
+ const size_t memory_size_v = size_v_bytes();
200
+
201
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
202
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
203
+ ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
204
+ ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
205
+ }
206
+
207
+ const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
208
+ debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
209
+ }
210
+
211
+ void llama_kv_cache::clear(bool data) {
212
+ for (uint32_t s = 0; s < n_stream; ++s) {
213
+ v_cells[s].reset();
214
+ v_heads[s] = 0;
215
+ }
216
+
217
+ if (data) {
218
+ for (auto & [_, buf] : ctxs_bufs) {
219
+ ggml_backend_buffer_clear(buf.get(), 0);
220
+ }
221
+ }
222
+ }
223
+
224
+ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
225
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
226
+
227
+ if (p0 < 0) {
228
+ p0 = 0;
229
+ }
230
+
231
+ if (p1 < 0) {
232
+ p1 = std::numeric_limits<llama_pos>::max();
233
+ }
234
+
235
+ if (seq_id >= 0) {
236
+ auto & cells = v_cells[seq_to_stream[seq_id]];
237
+ auto & head = v_heads[seq_to_stream[seq_id]];
238
+
239
+ uint32_t new_head = cells.size();
240
+
241
+ for (uint32_t i = 0; i < cells.size(); ++i) {
242
+ if (!cells.pos_in(i, p0, p1)) {
243
+ continue;
244
+ }
245
+
246
+ if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
247
+ if (new_head == cells.size()) {
248
+ new_head = i;
249
+ }
250
+ }
251
+ }
252
+
253
+ // If we freed up a slot, set head to it so searching can start there.
254
+ if (new_head != cells.size() && new_head < head) {
255
+ head = new_head;
256
+ }
257
+ } else {
258
+ // match any sequence
259
+ for (uint32_t s = 0; s < n_stream; ++s) {
260
+ auto & cells = v_cells[s];
261
+ auto & head = v_heads[s];
262
+
263
+ uint32_t new_head = cells.size();
264
+
265
+ for (uint32_t i = 0; i < cells.size(); ++i) {
266
+ if (!cells.pos_in(i, p0, p1)) {
267
+ continue;
268
+ }
269
+
270
+ cells.rm(i);
271
+
272
+ if (new_head == cells.size()) {
273
+ new_head = i;
274
+ }
275
+ }
276
+
277
+ // If we freed up a slot, set head to it so searching can start there.
278
+ if (new_head != cells.size() && new_head < head) {
279
+ head = new_head;
280
+ }
281
+ }
282
+ }
283
+
284
+ return true;
285
+ }
286
+
287
+ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
288
+ GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
289
+ GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
290
+
291
+ const auto s0 = seq_to_stream[seq_id_src];
292
+ const auto s1 = seq_to_stream[seq_id_dst];
293
+
294
+ if (s0 == s1) {
295
+ // since both sequences are in the same stream, no data copy is necessary
296
+ // we just have to update the cells meta data
297
+
298
+ auto & cells = v_cells[s0];
299
+
300
+ if (seq_id_src == seq_id_dst) {
301
+ return;
302
+ }
303
+
304
+ if (p0 < 0) {
305
+ p0 = 0;
306
+ }
307
+
308
+ if (p1 < 0) {
309
+ p1 = std::numeric_limits<llama_pos>::max();
310
+ }
311
+
312
+ for (uint32_t i = 0; i < cells.size(); ++i) {
313
+ if (!cells.pos_in(i, p0, p1)) {
314
+ continue;
315
+ }
316
+
317
+ if (cells.seq_has(i, seq_id_src)) {
318
+ cells.seq_add(i, seq_id_dst);
319
+ }
320
+ }
321
+
322
+ return;
323
+ }
324
+
325
+ // cross-stream sequence copies require to copy the actual buffer data
326
+
327
+ bool is_full = true;
328
+
329
+ if (p0 > 0 && p0 + 1 < (int) get_size()) {
330
+ is_full = false;
331
+ }
332
+
333
+ if (p1 > 0 && p1 + 1 < (int) get_size()) {
334
+ is_full = false;
335
+ }
336
+
337
+ GGML_ASSERT(is_full && "seq_cp() is only supported for full KV buffers");
338
+
339
+ // enqueue the copy operation - the buffer copy will be performed during the next update
340
+ sc_info.ssrc.push_back(s0);
341
+ sc_info.sdst.push_back(s1);
342
+
343
+ v_cells[s1].reset();
344
+ for (uint32_t i = 0; i < v_cells[s0].size(); ++i) {
345
+ if (v_cells[s0].seq_has(i, seq_id_src)) {
346
+ llama_pos pos = v_cells[s0].pos_get(i);
347
+ llama_pos shift = v_cells[s0].get_shift(i);
348
+
349
+ llama_kv_cell_ext ext = v_cells[s0].ext_get(i);
350
+
351
+ if (shift != 0) {
352
+ pos -= shift;
353
+ assert(pos >= 0);
354
+ }
355
+
356
+ v_cells[s1].pos_set(i, pos);
357
+ v_cells[s1].seq_add(i, seq_id_dst);
358
+
359
+ if (shift != 0) {
360
+ v_cells[s1].pos_add(i, shift);
361
+ }
362
+
363
+ v_cells[s1].ext_set(i, ext);
364
+ }
365
+ }
366
+
367
+ v_heads[s1] = v_heads[s0];
368
+
369
+ //for (uint32_t s = 0; s < n_stream; ++s) {
370
+ // LLAMA_LOG_WARN("%s: seq %d: min = %d, max = %d\n", __func__, s, v_cells[s].seq_pos_min(s), v_cells[s].seq_pos_max(s));
371
+ //}
372
+ }
373
+
374
+ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
375
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
376
+
377
+ auto & cells = v_cells[seq_to_stream[seq_id]];
378
+ auto & head = v_heads[seq_to_stream[seq_id]];
379
+
380
+ uint32_t new_head = cells.size();
381
+
382
+ for (uint32_t i = 0; i < cells.size(); ++i) {
383
+ if (cells.seq_keep(i, seq_id)) {
384
+ if (new_head == cells.size()) {
385
+ new_head = i;
386
+ }
387
+ }
388
+ }
389
+
390
+ // If we freed up a slot, set head to it so searching can start there.
391
+ if (new_head != cells.size() && new_head < head) {
392
+ head = new_head;
393
+ }
394
+ }
395
+
396
+ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
397
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
398
+ GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1");
399
+
400
+ auto & cells = v_cells[seq_to_stream[seq_id]];
401
+ auto & head = v_heads[seq_to_stream[seq_id]];
402
+
403
+ if (shift == 0) {
404
+ return;
405
+ }
406
+
407
+ uint32_t new_head = cells.size();
408
+
409
+ if (p0 < 0) {
410
+ p0 = 0;
411
+ }
412
+
413
+ if (p1 < 0) {
414
+ p1 = std::numeric_limits<llama_pos>::max();
415
+ }
416
+
417
+ // If there is no range then return early to avoid looping over all cells.
418
+ if (p0 == p1) {
419
+ return;
420
+ }
421
+
422
+ for (uint32_t i = 0; i < cells.size(); ++i) {
423
+ if (!cells.pos_in(i, p0, p1)) {
424
+ continue;
425
+ }
426
+
427
+ if (cells.seq_has(i, seq_id)) {
428
+ if (cells.pos_add(i, shift)) {
429
+ if (new_head == cells.size()) {
430
+ new_head = i;
431
+ }
432
+ }
433
+ }
434
+ }
435
+
436
+ // If we freed up a slot, set head to it so searching can start there.
437
+ // Otherwise we just start the next search from the beginning.
438
+ head = new_head != cells.size() ? new_head : 0;
439
+ }
440
+
441
+ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
442
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
443
+ GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1");
444
+
445
+ auto & cells = v_cells[seq_to_stream[seq_id]];
446
+
447
+ if (d == 1) {
448
+ return;
449
+ }
450
+
451
+ if (p0 < 0) {
452
+ p0 = 0;
453
+ }
454
+
455
+ if (p1 < 0) {
456
+ p1 = std::numeric_limits<llama_pos>::max();
457
+ }
458
+
459
+ // If there is no range then return early to avoid looping over the cache.
460
+ if (p0 == p1) {
461
+ return;
462
+ }
463
+
464
+ for (uint32_t i = 0; i < cells.size(); ++i) {
465
+ if (!cells.pos_in(i, p0, p1)) {
466
+ continue;
467
+ }
468
+
469
+ if (cells.seq_has(i, seq_id)) {
470
+ cells.pos_div(i, d);
471
+ }
472
+ }
473
+ }
474
+
475
+ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const {
476
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
477
+
478
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
479
+
480
+ return cells.seq_pos_min(seq_id);
481
+ }
482
+
483
+ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
484
+ GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
485
+
486
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
487
+
488
+ return cells.seq_pos_max(seq_id);
489
+ }
490
+
491
+ std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
492
+ std::map<ggml_backend_buffer_type_t, size_t> ret;
493
+ for (const auto & [ctx, buf] : ctxs_bufs) {
494
+ ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
495
+
496
+ if (hparams.no_alloc) {
497
+ GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
498
+ ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
499
+ } else {
500
+ // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
501
+ ret[buft] += ggml_backend_buffer_get_size(buf.get());
502
+ }
503
+ }
504
+
505
+ return ret;
506
+ }
507
+
508
+ llama_memory_context_ptr llama_kv_cache::init_batch(
509
+ llama_batch_allocr & balloc,
510
+ uint32_t n_ubatch,
511
+ bool embd_all) {
512
+ GGML_UNUSED(embd_all);
513
+
514
+ do {
515
+ balloc.split_reset();
516
+
517
+ std::vector<llama_ubatch> ubatches;
518
+ while (true) {
519
+ auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
520
+
521
+ if (ubatch.n_tokens == 0) {
522
+ break;
523
+ }
524
+
525
+ ubatches.push_back(std::move(ubatch)); // NOLINT
526
+ }
527
+
528
+ if (balloc.get_n_used() < balloc.get_n_tokens()) {
529
+ // failed to find a suitable split
530
+ break;
531
+ }
532
+
533
+ auto sinfos = prepare(ubatches);
534
+ if (sinfos.empty()) {
535
+ break;
536
+ }
537
+
538
+ return std::make_unique<llama_kv_cache_context>(
539
+ this, std::move(sinfos), std::move(ubatches));
540
+ } while (false);
541
+
542
+ return std::make_unique<llama_kv_cache_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
543
+ }
544
+
545
+ llama_memory_context_ptr llama_kv_cache::init_full() {
546
+ return std::make_unique<llama_kv_cache_context>(this);
547
+ }
548
+
549
+ llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
550
+ GGML_UNUSED(optimize);
551
+
552
+ bool do_shift = get_has_shift();
553
+
554
+ return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
555
+ }
556
+
557
+ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
558
+ llama_kv_cache::slot_info_vec_t res;
559
+
560
+ struct state_t {
561
+ slot_info sinfo; // slot info for the ubatch
562
+
563
+ std::vector<uint32_t> v_heads_old; // old positions of the heads, before placing the ubatch
564
+
565
+ std::vector<llama_kv_cells> v_cells; // copy of the old cells, before placing the ubatch
566
+ };
567
+
568
+ // remember the old state of the cells so we can restore it in the end
569
+ std::vector<state_t> states;
570
+
571
+ bool success = true;
572
+
573
+ for (const auto & ubatch : ubatches) {
574
+ // only find a suitable slot for the ubatch. don't modify the cells yet
575
+ const auto sinfo_new = find_slot(ubatch, false);
576
+ if (sinfo_new.empty()) {
577
+ success = false;
578
+ break;
579
+ }
580
+
581
+ // remeber the position that we found
582
+ res.push_back(sinfo_new);
583
+
584
+ // store the old state of the cells in the recovery stack
585
+ {
586
+ state_t state = { sinfo_new, v_heads, {} };
587
+
588
+ for (uint32_t s = 0; s < sinfo_new.n_stream(); ++s) {
589
+ auto & cells = v_cells[sinfo_new.strm[s]];
590
+
591
+ state.v_cells.push_back(cells.cp(sinfo_new.idxs[s]));
592
+ }
593
+
594
+ states.push_back(std::move(state));
595
+ }
596
+
597
+ // now emplace the ubatch
598
+ apply_ubatch(sinfo_new, ubatch);
599
+ }
600
+
601
+ GGML_ASSERT(!states.empty() || !success);
602
+
603
+ // iterate backwards and restore the cells to their original state
604
+ for (auto it = states.rbegin(); it != states.rend(); ++it) {
605
+ const auto & sinfo = it->sinfo;
606
+
607
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
608
+ auto & cells = v_cells[sinfo.strm[s]];
609
+ auto & head = v_heads[sinfo.strm[s]];
610
+
611
+ cells.set(sinfo.idxs[s], it->v_cells[s]);
612
+ head = it->v_heads_old[s];
613
+ }
614
+ }
615
+
616
+ if (!success) {
617
+ return {};
618
+ }
619
+
620
+ return res;
621
+ }
622
+
623
+ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
624
+ bool updated = false;
625
+
626
+ auto * sched = lctx->get_sched();
627
+
628
+ if (!sc_info.empty()) {
629
+ assert(n_stream > 1 && "stream copy should never happen with a single stream");
630
+
631
+ llama_synchronize(lctx);
632
+
633
+ const size_t n_copy = sc_info.ssrc.size();
634
+
635
+ for (size_t i = 0; i < n_copy; ++i) {
636
+ const auto ssrc = sc_info.ssrc[i];
637
+ const auto sdst = sc_info.sdst[i];
638
+
639
+ assert(ssrc < n_stream);
640
+ assert(sdst < n_stream);
641
+
642
+ LLAMA_LOG_DEBUG("%s: copying KV buffer: stream %d to stream %d\n", __func__, ssrc, sdst);
643
+
644
+ assert(ssrc != sdst);
645
+
646
+ for (uint32_t il = 0; il < layers.size(); ++il) {
647
+ const auto & layer = layers[il];
648
+
649
+ ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
650
+ ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
651
+ }
652
+ }
653
+ }
654
+
655
+ if (do_shift) {
656
+ if (!get_can_shift()) {
657
+ GGML_ABORT("The current KV cache / model configuration does not support K-shift");
658
+ }
659
+
660
+ LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
661
+
662
+ // apply K-shift if needed
663
+ if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
664
+ ggml_backend_sched_reset(sched);
665
+
666
+ auto * res = lctx->get_gf_res_reserve();
667
+
668
+ res->reset();
669
+
670
+ auto * gf = build_graph_shift(res, lctx);
671
+ if (!ggml_backend_sched_alloc_graph(sched, gf)) {
672
+ LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
673
+ return updated;
674
+ }
675
+
676
+ res->set_inputs(nullptr);
677
+
678
+ if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
679
+ LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
680
+ return updated;
681
+ }
682
+
683
+ updated = true;
684
+ }
685
+
686
+ for (uint32_t s = 0; s < n_stream; ++s) {
687
+ auto & cells = v_cells[s];
688
+
689
+ cells.reset_shift();
690
+ }
691
+ }
692
+
693
+ return updated;
694
+ }
695
+
696
+ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, bool cont) const {
697
+
698
+ if (debug > 0) {
699
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
700
+ const auto seq_id = ubatch.seq_id_unq[s];
701
+ const auto stream_id = seq_to_stream[seq_id];
702
+ const auto & cells = v_cells[stream_id];
703
+ const uint32_t head_cur = v_heads[stream_id];
704
+
705
+ LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
706
+ __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
707
+
708
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
709
+ std::string ss;
710
+ for (uint32_t i = 0; i < cells.size(); ++i) {
711
+ if (cells.is_empty(i)) {
712
+ ss += '.';
713
+ } else {
714
+ assert(cells.seq_count(i) >= 1);
715
+
716
+ if (cells.seq_count(i) == 1) {
717
+ ss += std::to_string(cells.seq_get(i));
718
+ } else {
719
+ ss += 'M';
720
+ }
721
+ }
722
+ if (i%256 == 255) {
723
+ ss += " *";
724
+ ss += '\n';
725
+ }
726
+ }
727
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
728
+ }
729
+
730
+ if ((debug == 2 && n_swa > 0) || debug > 2) {
731
+ std::string ss;
732
+ for (uint32_t i = 0; i < cells.size(); ++i) {
733
+ std::string cur;
734
+ if (cells.is_empty(i)) {
735
+ cur = '.';
736
+ } else {
737
+ cur = std::to_string(cells.pos_get(i));
738
+ }
739
+ const int n = cur.size();
740
+ for (int j = 0; j < 5 - n; ++j) {
741
+ cur += ' ';
742
+ }
743
+ ss += cur;
744
+ if (i%256 == 255) {
745
+ ss += " *";
746
+ }
747
+ if (i%64 == 63) {
748
+ ss += '\n';
749
+ }
750
+ }
751
+ LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
752
+ }
753
+
754
+ for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
755
+ if (cells.seq_pos_min(s) < 0) {
756
+ continue;
757
+ }
758
+
759
+ LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
760
+ }
761
+ }
762
+ }
763
+
764
+ uint32_t n_tokens = ubatch.n_tokens;
765
+ uint32_t n_seqs = 1;
766
+
767
+ if (n_stream > 1) {
768
+ GGML_ASSERT(n_tokens % ubatch.n_seqs_unq == 0);
769
+
770
+ n_seqs = ubatch.n_seqs_unq;
771
+ n_tokens = n_tokens / n_seqs;
772
+ }
773
+
774
+ slot_info res = {
775
+ /*.s0 =*/ LLAMA_MAX_SEQ,
776
+ /*.s1 =*/ 0,
777
+ /*.strm =*/ { },
778
+ /*.idxs =*/ { },
779
+ };
780
+
781
+ res.resize(n_seqs);
782
+
783
+ for (uint32_t s = 0; s < n_seqs; ++s) {
784
+ const auto seq_id = ubatch.seq_id_unq[s];
785
+
786
+ if (n_stream > 1) {
787
+ GGML_ASSERT(ubatch.n_seq_id[s*n_tokens] == 1);
788
+ GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
789
+ }
790
+
791
+ res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
792
+ res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
793
+
794
+ res.strm[s] = seq_to_stream[seq_id];
795
+ res.idxs[s].reserve(n_tokens);
796
+
797
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
798
+
799
+ uint32_t head_cur = v_heads[seq_to_stream[seq_id]];
800
+
801
+ // if we have enough unused cells before the current head ->
802
+ // better to start searching from the beginning of the cache, hoping to fill it
803
+ if (head_cur > cells.get_used() + 2*n_tokens) {
804
+ head_cur = 0;
805
+ }
806
+
807
+ if (n_tokens > cells.size()) {
808
+ LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
809
+ return { };
810
+ }
811
+
812
+ uint32_t n_tested = 0;
813
+
814
+ // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
815
+ // for non-continuous slots, we test the tokens one by one
816
+ const uint32_t n_test = cont ? n_tokens : 1;
817
+
818
+ while (true) {
819
+ if (head_cur + n_test > cells.size()) {
820
+ n_tested += cells.size() - head_cur;
821
+ head_cur = 0;
822
+ continue;
823
+ }
824
+
825
+ for (uint32_t i = 0; i < n_test; i++) {
826
+ const auto idx = head_cur;
827
+
828
+ head_cur++;
829
+ n_tested++;
830
+
831
+ //const llama_pos pos = ubatch.pos[i];
832
+ //const llama_seq_id seq_id = ubatch.seq_id[i][0];
833
+
834
+ // can we use this cell? either:
835
+ // - the cell is empty
836
+ // - the cell is occupied only by one sequence:
837
+ // - (disabled) mask causally, if the sequence is the same as the one we are inserting
838
+ // - mask SWA, using current max pos for that sequence in the cache
839
+ // always insert in the cell with minimum pos
840
+ bool can_use = cells.is_empty(idx);
841
+
842
+ if (!can_use && cells.seq_count(idx) == 1) {
843
+ const llama_pos pos_cell = cells.pos_get(idx);
844
+
845
+ // (disabled) causal mask
846
+ // note: it's better to purge any "future" tokens beforehand
847
+ //if (cells.seq_has(idx, seq_id)) {
848
+ // can_use = pos_cell >= pos;
849
+ //}
850
+
851
+ if (!can_use) {
852
+ const llama_seq_id seq_id_cell = cells.seq_get(idx);
853
+
854
+ // SWA mask
855
+ if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
856
+ can_use = true;
857
+ }
858
+ }
859
+ }
860
+
861
+ if (can_use) {
862
+ res.idxs[s].push_back(idx);
863
+ } else {
864
+ if (cont) {
865
+ break;
866
+ }
867
+ }
868
+ }
869
+
870
+ if (res.idxs[s].size() == n_tokens) {
871
+ break;
872
+ }
873
+
874
+ if (cont) {
875
+ res.idxs[s].clear();
876
+ }
877
+
878
+ if (n_tested >= cells.size()) {
879
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
880
+ return { };
881
+ }
882
+ }
883
+
884
+ // we didn't find a suitable slot - return empty result
885
+ if (res.idxs[s].size() < n_tokens) {
886
+ return { };
887
+ }
888
+ }
889
+
890
+ assert(res.s1 >= res.s0);
891
+
892
+ return res;
893
+ }
894
+
895
+ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
896
+ // keep track of the max sequence position that we would overwrite with this ubatch
897
+ // for non-SWA cache, this would be always empty
898
+ llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
899
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
900
+ seq_pos_max_rm[s] = -1;
901
+ }
902
+
903
+ assert(ubatch.n_tokens == sinfo.n_stream()*sinfo.size());
904
+
905
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
906
+ for (uint32_t ii = 0; ii < sinfo.size(); ++ii) {
907
+ const uint32_t i = s*sinfo.size() + ii;
908
+
909
+ auto & cells = v_cells[sinfo.strm[s]];
910
+
911
+ const auto idx = sinfo.idxs[s][ii];
912
+
913
+ if (!cells.is_empty(idx)) {
914
+ assert(cells.seq_count(idx) == 1);
915
+
916
+ const llama_seq_id seq_id = cells.seq_get(idx);
917
+ const llama_pos pos = cells.pos_get(idx);
918
+
919
+ seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
920
+
921
+ cells.rm(idx);
922
+ }
923
+
924
+ cells.pos_set(idx, ubatch.pos[i]);
925
+
926
+ if (ubatch.is_pos_2d()) {
927
+ llama_kv_cell_ext ext {
928
+ /*.x =*/ ubatch.pos[i + ubatch.n_tokens*2],
929
+ /*.y =*/ ubatch.pos[i + ubatch.n_tokens],
930
+ };
931
+ cells.ext_set(idx, ext);
932
+ }
933
+
934
+ for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
935
+ cells.seq_add(idx, ubatch.seq_id[i][s]);
936
+ }
937
+ }
938
+ }
939
+
940
+ // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
941
+ // will be present in the cache. so we have to purge any position which is less than those we would overwrite
942
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
943
+ for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
944
+ if (seq_pos_max_rm[s] == -1) {
945
+ continue;
946
+ }
947
+
948
+ GGML_ASSERT(s < seq_to_stream.size());
949
+
950
+ auto & cells = v_cells[seq_to_stream[s]];
951
+
952
+ if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
953
+ LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
954
+ __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
955
+
956
+ seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
957
+ }
958
+ }
959
+
960
+ // move the head at the end of the slot
961
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
962
+ auto & head = v_heads[sinfo.strm[s]];
963
+
964
+ head = sinfo.idxs[s].back() + 1;
965
+ }
966
+ }
967
+
968
+ bool llama_kv_cache::get_can_shift() const {
969
+ return true;
970
+ }
971
+
972
+ uint32_t llama_kv_cache::get_size() const {
973
+ const auto & cells = v_cells[seq_to_stream[0]];
974
+
975
+ return cells.size();
976
+ }
977
+
978
+ uint32_t llama_kv_cache::get_n_stream() const {
979
+ return n_stream;
980
+ }
981
+
982
+ bool llama_kv_cache::get_has_shift() const {
983
+ bool result = false;
984
+
985
+ for (uint32_t s = 0; s < n_stream; ++s) {
986
+ result |= v_cells[s].get_has_shift();
987
+ }
988
+
989
+ return result;
990
+ }
991
+
992
+ uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
993
+ uint32_t result = 0;
994
+
995
+ // pad the n_kv value so that the graph remains constant across batches and can be reused
996
+ // note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
997
+ const uint32_t n_pad_cur = std::max(n_pad, 256u);
998
+
999
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
1000
+ const auto & cells = v_cells[sinfo.strm[s]];
1001
+
1002
+ result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
1003
+ }
1004
+
1005
+ return result;
1006
+ }
1007
+
1008
+ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
1009
+ const int32_t ikv = map_layer_ids.at(il);
1010
+
1011
+ auto * k = layers[ikv].k;
1012
+
1013
+ const uint64_t kv_size = get_size();
1014
+ const uint64_t n_embd_k_gqa = k->ne[0];
1015
+
1016
+ assert(n_embd_k_gqa == hparams.n_embd_k_gqa(il));
1017
+
1018
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
1019
+
1020
+ return ggml_view_4d(ctx, k,
1021
+ hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
1022
+ ggml_row_size(k->type, hparams.n_embd_head_k),
1023
+ ggml_row_size(k->type, n_embd_k_gqa),
1024
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size),
1025
+ ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
1026
+ }
1027
+
1028
+ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
1029
+ const int32_t ikv = map_layer_ids.at(il);
1030
+
1031
+ auto * v = layers[ikv].v;
1032
+
1033
+ const uint64_t kv_size = get_size();
1034
+ const uint64_t n_embd_v_gqa = v->ne[0];
1035
+
1036
+ // [TAG_V_CACHE_VARIABLE]
1037
+ assert(n_embd_v_gqa >= hparams.n_embd_v_gqa(il));
1038
+
1039
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
1040
+
1041
+ if (!v_trans) {
1042
+ // note: v->nb[1] <= v->nb[2]
1043
+ return ggml_view_4d(ctx, v,
1044
+ hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
1045
+ ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
1046
+ ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
1047
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
1048
+ ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
1049
+ }
1050
+
1051
+ // note: v->nb[1] > v->nb[2]
1052
+ return ggml_view_4d(ctx, v,
1053
+ n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
1054
+ ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
1055
+ ggml_row_size(v->type, kv_size), // v->nb[2]
1056
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
1057
+ ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
1058
+ }
1059
+
1060
+ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
1061
+ GGML_UNUSED(sinfo);
1062
+
1063
+ const int32_t ikv = map_layer_ids.at(il);
1064
+
1065
+ ggml_tensor * k = layers[ikv].k;
1066
+
1067
+ const int64_t n_embd_head = k_cur->ne[0];
1068
+ const int64_t n_head = k_cur->ne[1];
1069
+ const int64_t n_tokens = k_cur->ne[2];
1070
+
1071
+ const int64_t n_embd_gqa = n_embd_head*n_head;
1072
+
1073
+ // we can merge dims 0 and 1
1074
+ // TODO: add ggml helper function for this?
1075
+ GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
1076
+
1077
+ k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
1078
+
1079
+ const int64_t n_stream = k->ne[2];
1080
+
1081
+ if (n_stream > 1) {
1082
+ const int64_t kv_size = get_size();
1083
+
1084
+ assert(n_embd_gqa == k->ne[0]);
1085
+ assert(kv_size == k->ne[1]);
1086
+
1087
+ // merge the buffer across all streams because the idxs are global
1088
+ k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
1089
+ }
1090
+
1091
+ // store the current K values into the cache
1092
+ return ggml_set_rows(ctx, k, k_cur, k_idxs);
1093
+ }
1094
+
1095
+ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
1096
+ GGML_UNUSED(sinfo);
1097
+
1098
+ const int32_t ikv = map_layer_ids.at(il);
1099
+
1100
+ auto * v = layers[ikv].v;
1101
+
1102
+ const int64_t n_embd_head = v_cur->ne[0];
1103
+ const int64_t n_head = v_cur->ne[1];
1104
+ const int64_t n_tokens = v_cur->ne[2];
1105
+
1106
+ const int64_t n_embd_gqa = n_embd_head*n_head;
1107
+
1108
+ // we can merge dims 0 and 1
1109
+ GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
1110
+
1111
+ const int64_t n_stream = v->ne[2];
1112
+
1113
+ // take this branch when FA is enabled (the V cache is not transposed)
1114
+ if (!v_trans) {
1115
+ v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
1116
+
1117
+ if (n_stream > 1) {
1118
+ const int64_t kv_size = get_size();
1119
+
1120
+ assert(n_embd_gqa == v->ne[0]);
1121
+ assert(kv_size == v->ne[1]);
1122
+
1123
+ // merge the buffer across all streams because the idxs are global
1124
+ v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
1125
+ }
1126
+
1127
+ return ggml_set_rows(ctx, v, v_cur, v_idxs);
1128
+ }
1129
+
1130
+ if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
1131
+ // we can merge dims 0, 1 and 2
1132
+ v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
1133
+ } else {
1134
+ // otherwise -> make a copy to get contiguous data
1135
+ v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
1136
+ }
1137
+
1138
+ // [TAG_V_CACHE_VARIABLE]
1139
+ if (n_embd_gqa < v->ne[0]) {
1140
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
1141
+ }
1142
+
1143
+ // in this branch the v_idxs are constructed in such a way that each row is a single head element
1144
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
1145
+
1146
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
1147
+
1148
+ return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
1149
+ }
1150
+
1151
+ ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
1152
+ const uint32_t n_tokens = ubatch.n_tokens;
1153
+
1154
+ ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
1155
+
1156
+ ggml_set_input(k_idxs);
1157
+
1158
+ return k_idxs;
1159
+ }
1160
+
1161
+ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
1162
+ const uint32_t n_tokens = ubatch.n_tokens;
1163
+
1164
+ ggml_tensor * v_idxs;
1165
+
1166
+ if (!v_trans) {
1167
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
1168
+ } else {
1169
+ v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens*hparams.n_embd_v_gqa_max());
1170
+ }
1171
+
1172
+ ggml_set_input(v_idxs);
1173
+
1174
+ return v_idxs;
1175
+ }
1176
+
1177
+ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
1178
+ const uint32_t n_tokens = ubatch->n_tokens;
1179
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
1180
+
1181
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1182
+ int64_t * data = (int64_t *) dst->data;
1183
+
1184
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
1185
+ const int64_t offs = sinfo.strm[s]*get_size();
1186
+
1187
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
1188
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
1189
+ }
1190
+ }
1191
+ }
1192
+
1193
+ void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
1194
+ const uint32_t n_tokens = ubatch->n_tokens;
1195
+ GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
1196
+
1197
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1198
+ int64_t * data = (int64_t *) dst->data;
1199
+
1200
+ if (!v_trans) {
1201
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
1202
+ const int64_t offs = sinfo.strm[s]*get_size();
1203
+
1204
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
1205
+ data[s*sinfo.size() + i] = offs + sinfo.idxs[s][i];
1206
+ }
1207
+ }
1208
+ } else {
1209
+ // note: the V cache is transposed when not using flash attention
1210
+ const int64_t kv_size = get_size();
1211
+
1212
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa_max();
1213
+
1214
+ for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
1215
+ const int64_t offs = sinfo.strm[s]*kv_size*n_embd_v_gqa;
1216
+
1217
+ for (uint32_t i = 0; i < sinfo.size(); ++i) {
1218
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1219
+ data[s*sinfo.size()*n_embd_v_gqa + i*n_embd_v_gqa + j] = offs + j*kv_size + sinfo.idxs[s][i];
1220
+ }
1221
+ }
1222
+ }
1223
+ }
1224
+ }
1225
+
1226
+ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
1227
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1228
+
1229
+ int32_t * data = (int32_t *) dst->data;
1230
+
1231
+ for (uint32_t s = 0; s < n_stream; ++s) {
1232
+ const auto & cells = v_cells[s];
1233
+
1234
+ for (uint32_t i = 0; i < cells.size(); ++i) {
1235
+ data[s*cells.size() + i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
1236
+ }
1237
+ }
1238
+ }
1239
+
1240
+ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1241
+ const uint32_t n_tokens = ubatch->n_tokens;
1242
+
1243
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1244
+ float * data = (float *) dst->data;
1245
+
1246
+ const int64_t n_kv = dst->ne[0];
1247
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
1248
+
1249
+ GGML_ASSERT(n_tokens%n_stream == 0);
1250
+
1251
+ // n_tps == n_tokens_per_stream
1252
+ const int64_t n_tps = n_tokens/n_stream;
1253
+
1254
+ std::fill(data, data + ggml_nelements(dst), -INFINITY);
1255
+
1256
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
1257
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
1258
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
1259
+ // Causal mask:
1260
+ // xxx-------
1261
+ // xxxx------
1262
+ // xxxxx-----
1263
+ // Non-causal mask:
1264
+ // xxxxx-----
1265
+ // xxxxx-----
1266
+ // xxxxx-----
1267
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
1268
+ // TODO: optimize this section
1269
+ for (uint32_t h = 0; h < 1; ++h) {
1270
+ for (uint32_t s = 0; s < n_stream; ++s) {
1271
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
1272
+ const uint32_t i = s*n_tps + ii;
1273
+
1274
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
1275
+
1276
+ const auto & cells = v_cells[seq_to_stream[seq_id]];
1277
+
1278
+ const llama_pos p1 = ubatch->pos[i];
1279
+
1280
+ // for M-RoPE
1281
+ const bool is_2d = ubatch->is_pos_2d();
1282
+ const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
1283
+ const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
1284
+
1285
+ const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
1286
+
1287
+ for (uint32_t j = 0; j < n_kv; ++j) {
1288
+ if (cells.is_empty(j)) {
1289
+ continue;
1290
+ }
1291
+
1292
+ // mask the token if not the same sequence
1293
+ if (!cells.seq_has(j, seq_id)) {
1294
+ continue;
1295
+ }
1296
+
1297
+ const llama_pos p0 = cells.pos_get(j);
1298
+
1299
+ // mask future tokens
1300
+ if (causal_attn && p0 > p1) {
1301
+ continue;
1302
+ }
1303
+
1304
+ // M-RoPE causal mask
1305
+ if (causal_attn && is_2d && p0 == p1) {
1306
+ const auto & p0_ext = cells.ext_get(j);
1307
+ if (p0_ext.is_2d_gt(p1_x, p1_y)) {
1308
+ continue;
1309
+ }
1310
+ }
1311
+
1312
+ // apply SWA if any
1313
+ if (is_masked_swa(p0, p1)) {
1314
+ continue;
1315
+ }
1316
+
1317
+ data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
1318
+ }
1319
+ }
1320
+ }
1321
+ }
1322
+ }
1323
+
1324
+ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
1325
+ const int64_t n_tokens = ubatch->n_tokens;
1326
+
1327
+ GGML_ASSERT(n_stream == 1 && "TODO: support multiple streams");
1328
+ const auto & cells = v_cells[0];
1329
+
1330
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1331
+ GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing
1332
+
1333
+ int32_t * data = (int32_t *) dst->data;
1334
+
1335
+ const int32_t n_kv = dst->ne[0];
1336
+
1337
+ for (int h = 0; h < 1; ++h) {
1338
+ for (int i = 0; i < n_tokens; ++i) {
1339
+ for (int j = 0; j < n_kv; ++j) {
1340
+ // the position when the cells is empty is irrelevant - it will be masked out later in the attention
1341
+ const llama_pos p0 = cells.is_empty(j) ? -1 : cells.pos_get(j);
1342
+
1343
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = llama_relative_position_bucket(p0, ubatch->pos[i], hparams.n_rel_attn_bkts, false);
1344
+ }
1345
+ }
1346
+ }
1347
+ }
1348
+
1349
+ size_t llama_kv_cache::total_size() const {
1350
+ size_t size = 0;
1351
+
1352
+ for (const auto & [_, buf] : ctxs_bufs) {
1353
+ size += ggml_backend_buffer_get_size(buf.get());
1354
+ }
1355
+
1356
+ return size;
1357
+ }
1358
+
1359
+ size_t llama_kv_cache::size_k_bytes() const {
1360
+ size_t size_k_bytes = 0;
1361
+
1362
+ for (const auto & layer : layers) {
1363
+ size_k_bytes += ggml_nbytes(layer.k);
1364
+ }
1365
+
1366
+ return size_k_bytes;
1367
+ }
1368
+
1369
+ size_t llama_kv_cache::size_v_bytes() const {
1370
+ size_t size_v_bytes = 0;
1371
+
1372
+ for (const auto & layer : layers) {
1373
+ size_v_bytes += ggml_nbytes(layer.v);
1374
+ }
1375
+
1376
+ return size_v_bytes;
1377
+ }
1378
+
1379
+ ggml_tensor * llama_kv_cache::build_rope_shift(
1380
+ const llama_cparams & cparams,
1381
+ ggml_context * ctx,
1382
+ ggml_tensor * cur,
1383
+ ggml_tensor * shift,
1384
+ ggml_tensor * factors,
1385
+ float freq_base,
1386
+ float freq_scale) const {
1387
+ const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
1388
+
1389
+ const auto & yarn_ext_factor = cparams.yarn_ext_factor;
1390
+ const auto & yarn_beta_fast = cparams.yarn_beta_fast;
1391
+ const auto & yarn_beta_slow = cparams.yarn_beta_slow;
1392
+ const auto & yarn_attn_factor = cparams.yarn_attn_factor;
1393
+
1394
+ const auto & n_rot = hparams.n_rot;
1395
+ const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
1396
+ // @ngxson : this is a workaround
1397
+ // for M-RoPE, we want to rotate the whole vector when doing KV shift
1398
+ // a normal RoPE should work, we just need to use the correct ordering
1399
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13870
1400
+ ? LLAMA_ROPE_TYPE_NEOX
1401
+ : hparams.rope_type;
1402
+
1403
+ ggml_tensor * tmp;
1404
+
1405
+ if (ggml_is_quantized(cur->type)) {
1406
+ // dequantize to f32 -> RoPE -> quantize back
1407
+ tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
1408
+
1409
+ tmp = ggml_rope_ext(ctx, tmp,
1410
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
1411
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
1412
+
1413
+ tmp = ggml_cpy(ctx, tmp, cur);
1414
+ } else {
1415
+ // we rotate only the first n_rot dimensions
1416
+ tmp = ggml_rope_ext_inplace(ctx, cur,
1417
+ shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
1418
+ yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
1419
+ }
1420
+
1421
+ return tmp;
1422
+ }
1423
+
1424
+ class llm_graph_input_k_shift : public llm_graph_input_i {
1425
+ public:
1426
+ llm_graph_input_k_shift(const llama_kv_cache * kv_self) : kv_self(kv_self) {}
1427
+ virtual ~llm_graph_input_k_shift() = default;
1428
+
1429
+ void set_input(const llama_ubatch * ubatch) override;
1430
+
1431
+ ggml_tensor * k_shift; // I32 [kv_size*n_stream]
1432
+
1433
+ const llama_kv_cache * kv_self;
1434
+ };
1435
+
1436
+ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
1437
+ GGML_UNUSED(ubatch);
1438
+
1439
+ if (k_shift) {
1440
+ kv_self->set_input_k_shift(k_shift);
1441
+ }
1442
+ }
1443
+
1444
+ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
1445
+ auto * ctx = res->get_ctx();
1446
+ auto * gf = res->get_gf();
1447
+
1448
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
1449
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
1450
+
1451
+ auto inp = std::make_unique<llm_graph_input_k_shift>(this);
1452
+
1453
+ inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
1454
+ ggml_set_input(inp->k_shift);
1455
+
1456
+ const auto & cparams = lctx->get_cparams();
1457
+
1458
+ for (const auto & layer : layers) {
1459
+ const uint32_t il = layer.il;
1460
+
1461
+ const int64_t n_head_kv = hparams.n_head_kv(il);
1462
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1463
+
1464
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
1465
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
1466
+
1467
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
1468
+
1469
+ ggml_tensor * k =
1470
+ ggml_view_3d(ctx, layer.k,
1471
+ n_embd_head_k, n_head_kv, get_size()*n_stream,
1472
+ ggml_row_size(layer.k->type, n_embd_head_k),
1473
+ ggml_row_size(layer.k->type, n_embd_k_gqa),
1474
+ 0);
1475
+
1476
+ ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
1477
+
1478
+ ggml_build_forward_expand(gf, cur);
1479
+ }
1480
+
1481
+ res->add_input(std::move(inp));
1482
+
1483
+ return gf;
1484
+ }
1485
+
1486
+ bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
1487
+ return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
1488
+ }
1489
+
1490
+ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
1491
+ GGML_UNUSED(flags);
1492
+
1493
+ io.write(&n_stream, sizeof(n_stream));
1494
+
1495
+ for (uint32_t s = 0; s < n_stream; ++s) {
1496
+ cell_ranges_t cr { s, {} };
1497
+
1498
+ uint32_t cell_count = 0;
1499
+
1500
+ const auto & cells = v_cells[s];
1501
+
1502
+ // Count the number of cells with the specified seq_id
1503
+ // Find all the ranges of cells with this seq id (or all, when -1)
1504
+ uint32_t cell_range_begin = cells.size();
1505
+
1506
+ for (uint32_t i = 0; i < cells.size(); ++i) {
1507
+ if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
1508
+ ++cell_count;
1509
+ if (cell_range_begin == cells.size()) {
1510
+ cell_range_begin = i;
1511
+ }
1512
+ } else {
1513
+ if (cell_range_begin != cells.size()) {
1514
+ cr.data.emplace_back(cell_range_begin, i);
1515
+ cell_range_begin = cells.size();
1516
+ }
1517
+ }
1518
+ }
1519
+
1520
+ if (cell_range_begin != cells.size()) {
1521
+ cr.data.emplace_back(cell_range_begin, cells.size());
1522
+ }
1523
+
1524
+ // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
1525
+ uint32_t cell_count_check = 0;
1526
+ for (const auto & range : cr.data) {
1527
+ cell_count_check += range.second - range.first;
1528
+ }
1529
+ GGML_ASSERT(cell_count == cell_count_check);
1530
+
1531
+ io.write(&cell_count, sizeof(cell_count));
1532
+
1533
+ // skip empty streams
1534
+ if (cell_count == 0) {
1535
+ continue;
1536
+ }
1537
+
1538
+ state_write_meta(io, cr, seq_id);
1539
+ state_write_data(io, cr);
1540
+ }
1541
+ }
1542
+
1543
+ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
1544
+ GGML_UNUSED(flags);
1545
+
1546
+ GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
1547
+
1548
+ uint32_t n_stream_cur;
1549
+ io.read_to(&n_stream_cur, sizeof(n_stream_cur));
1550
+ if (n_stream_cur != n_stream) {
1551
+ throw std::runtime_error("n_stream mismatch");
1552
+ }
1553
+
1554
+ for (uint32_t s = 0; s < n_stream; ++s) {
1555
+ uint32_t cell_count;
1556
+ io.read_to(&cell_count, sizeof(cell_count));
1557
+
1558
+ if (cell_count == 0) {
1559
+ continue;
1560
+ }
1561
+
1562
+ const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
1563
+
1564
+ slot_info sinfo;
1565
+
1566
+ bool res = true;
1567
+ res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
1568
+ res = res && state_read_data(io, strm, cell_count, sinfo);
1569
+
1570
+ if (!res) {
1571
+ if (seq_id == -1) {
1572
+ clear(true);
1573
+ } else {
1574
+ seq_rm(seq_id, -1, -1);
1575
+ }
1576
+ throw std::runtime_error("failed to restore kv cache");
1577
+ }
1578
+ }
1579
+ }
1580
+
1581
+ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {
1582
+ const auto & cells = v_cells[cr.strm];
1583
+
1584
+ for (const auto & range : cr.data) {
1585
+ for (uint32_t i = range.first; i < range.second; ++i) {
1586
+ std::vector<llama_seq_id> seq_ids;
1587
+
1588
+ for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
1589
+ if (cur == seq_id || seq_id == -1) {
1590
+ if (cells.seq_has(i, cur)) {
1591
+ seq_ids.push_back(cur);
1592
+ }
1593
+ }
1594
+ }
1595
+
1596
+ const llama_pos pos = cells.pos_get(i);
1597
+ const uint32_t n_seq_id = seq_ids.size();
1598
+
1599
+ io.write(&pos, sizeof(pos));
1600
+ io.write(&n_seq_id, sizeof(n_seq_id));
1601
+
1602
+ // TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
1603
+ // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
1604
+
1605
+ for (const auto & seq_id : seq_ids) {
1606
+ io.write(&seq_id, sizeof(seq_id));
1607
+ }
1608
+ }
1609
+ }
1610
+ }
1611
+
1612
+ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const {
1613
+ const auto & cells = v_cells[cr.strm];
1614
+
1615
+ const uint32_t v_trans = this->v_trans ? 1 : 0;
1616
+ const uint32_t n_layer = layers.size();
1617
+
1618
+ io.write(&v_trans, sizeof(v_trans));
1619
+ io.write(&n_layer, sizeof(n_layer));
1620
+
1621
+ std::vector<uint8_t> tmp_buf;
1622
+
1623
+ // Iterate and write all the keys first, each row is a cell
1624
+ // Get whole range at a time
1625
+ for (const auto & layer : layers) {
1626
+ const uint32_t il = layer.il;
1627
+
1628
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1629
+
1630
+ auto * k = layer.k_stream[cr.strm];
1631
+
1632
+ // Write key type
1633
+ const int32_t k_type_i = (int32_t) k->type;
1634
+ io.write(&k_type_i, sizeof(k_type_i));
1635
+
1636
+ // Write row size of key
1637
+ const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
1638
+ io.write(&k_size_row, sizeof(k_size_row));
1639
+
1640
+ // Read each range of cells of k_size length each into tmp_buf and write out
1641
+ for (const auto & range : cr.data) {
1642
+ const size_t range_size = range.second - range.first;
1643
+ const size_t buf_size = range_size * k_size_row;
1644
+ io.write_tensor(k, range.first * k_size_row, buf_size);
1645
+ }
1646
+ }
1647
+
1648
+ if (!v_trans) {
1649
+ for (const auto & layer : layers) {
1650
+ const uint32_t il = layer.il;
1651
+
1652
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1653
+
1654
+ auto * v = layer.v_stream[cr.strm];
1655
+
1656
+ // Write value type
1657
+ const int32_t v_type_i = (int32_t) v->type;
1658
+ io.write(&v_type_i, sizeof(v_type_i));
1659
+
1660
+ // Write row size of value
1661
+ const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
1662
+ io.write(&v_size_row, sizeof(v_size_row));
1663
+
1664
+ // Read each range of cells of v_size length each into tmp_buf and write out
1665
+ for (const auto & range : cr.data) {
1666
+ const size_t range_size = range.second - range.first;
1667
+ const size_t buf_size = range_size * v_size_row;
1668
+ io.write_tensor(v, range.first * v_size_row, buf_size);
1669
+ }
1670
+ }
1671
+ } else {
1672
+ // When v is transposed, we also need the element size and get the element ranges from each row
1673
+ const uint32_t kv_size = cells.size();
1674
+
1675
+ for (const auto & layer : layers) {
1676
+ const uint32_t il = layer.il;
1677
+
1678
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1679
+
1680
+ auto * v = layer.v_stream[cr.strm];
1681
+
1682
+ // Write value type
1683
+ const int32_t v_type_i = (int32_t) v->type;
1684
+ io.write(&v_type_i, sizeof(v_type_i));
1685
+
1686
+ // Write element size
1687
+ const uint32_t v_size_el = ggml_type_size(v->type);
1688
+ io.write(&v_size_el, sizeof(v_size_el));
1689
+
1690
+ // Write GQA embedding size
1691
+ io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
1692
+
1693
+ // For each row, we get the element values of each cell
1694
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1695
+ // Read each range of cells of v_size_el length each into tmp_buf and write out
1696
+ for (const auto & range : cr.data) {
1697
+ const size_t range_size = range.second - range.first;
1698
+ const size_t src_offset = (range.first + j * kv_size) * v_size_el;
1699
+ const size_t buf_size = range_size * v_size_el;
1700
+ io.write_tensor(v, src_offset, buf_size);
1701
+ }
1702
+ }
1703
+ }
1704
+ }
1705
+ }
1706
+
1707
+ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
1708
+ auto & cells = v_cells[strm];
1709
+ auto & head = v_heads[strm];
1710
+
1711
+ if (dest_seq_id != -1) {
1712
+ // single sequence
1713
+ seq_rm(dest_seq_id, -1, -1);
1714
+
1715
+ llama_batch_allocr balloc(hparams.n_pos_per_embd());
1716
+
1717
+ llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
1718
+
1719
+ ubatch.seq_id_unq[0] = dest_seq_id;
1720
+
1721
+ for (uint32_t i = 0; i < cell_count; ++i) {
1722
+ llama_pos pos;
1723
+ uint32_t n_seq_id;
1724
+
1725
+ io.read_to(&pos, sizeof(pos));
1726
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1727
+
1728
+ if (n_seq_id != 1) {
1729
+ LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
1730
+ return false;
1731
+ }
1732
+
1733
+ // read the sequence id, but directly discard it - we will use dest_seq_id instead
1734
+ {
1735
+ llama_seq_id seq_id;
1736
+ io.read_to(&seq_id, sizeof(seq_id));
1737
+ }
1738
+
1739
+ ubatch.pos[i] = pos;
1740
+ ubatch.n_seq_id[i] = n_seq_id;
1741
+ ubatch.seq_id[i] = &dest_seq_id;
1742
+ }
1743
+
1744
+ sinfo = find_slot(ubatch, false);
1745
+ if (sinfo.empty()) {
1746
+ LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
1747
+ return false;
1748
+ }
1749
+
1750
+ // TODO: we cannot yet restore llama_kv_cell_ext as the apply_ubatch() does not support it yet
1751
+ // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
1752
+ apply_ubatch(sinfo, ubatch);
1753
+
1754
+ LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
1755
+
1756
+ // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
1757
+ GGML_ASSERT(sinfo.n_stream() == 1);
1758
+ GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
1759
+ for (uint32_t i = 0; i < cell_count; ++i) {
1760
+ const uint32_t idx = sinfo.idxs[0][i];
1761
+ GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
1762
+ GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
1763
+ }
1764
+ } else {
1765
+ // whole KV cache restore
1766
+
1767
+ if (cell_count > cells.size()) {
1768
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
1769
+ return false;
1770
+ }
1771
+
1772
+ clear(true);
1773
+
1774
+ for (uint32_t i = 0; i < cell_count; ++i) {
1775
+ llama_pos pos;
1776
+ uint32_t n_seq_id;
1777
+
1778
+ io.read_to(&pos, sizeof(pos));
1779
+ io.read_to(&n_seq_id, sizeof(n_seq_id));
1780
+
1781
+ cells.pos_set(i, pos);
1782
+
1783
+ for (uint32_t j = 0; j < n_seq_id; ++j) {
1784
+ llama_seq_id seq_id;
1785
+ io.read_to(&seq_id, sizeof(seq_id));
1786
+
1787
+ if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
1788
+ LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
1789
+ return false;
1790
+ }
1791
+
1792
+ cells.seq_add(i, seq_id);
1793
+ }
1794
+ }
1795
+
1796
+ // Create contiguous slot_info for whole cache restore
1797
+ sinfo.s0 = strm;
1798
+ sinfo.s1 = strm;
1799
+ sinfo.resize(1);
1800
+ sinfo.strm[0] = strm;
1801
+ sinfo.idxs[0].resize(cell_count);
1802
+ for (uint32_t i = 0; i < cell_count; ++i) {
1803
+ sinfo.idxs[0][i] = i;
1804
+ }
1805
+
1806
+ head = 0;
1807
+ }
1808
+
1809
+ return true;
1810
+ }
1811
+
1812
+ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
1813
+ auto & cells = v_cells[strm];
1814
+
1815
+ uint32_t v_trans;
1816
+ uint32_t n_layer;
1817
+
1818
+ io.read_to(&v_trans, sizeof(v_trans));
1819
+ io.read_to(&n_layer, sizeof(n_layer));
1820
+
1821
+ if (n_layer != layers.size()) {
1822
+ LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
1823
+ return false;
1824
+ }
1825
+
1826
+ if (cell_count > cells.size()) {
1827
+ LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
1828
+ return false;
1829
+ }
1830
+
1831
+ if (this->v_trans != (bool) v_trans) {
1832
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
1833
+ return false;
1834
+ }
1835
+
1836
+ // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
1837
+ for (const auto & layer : layers) {
1838
+ const uint32_t il = layer.il;
1839
+
1840
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1841
+
1842
+ auto * k = layer.k_stream[strm];
1843
+
1844
+ // Read type of key
1845
+ int32_t k_type_i_ref;
1846
+ io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
1847
+ const int32_t k_type_i = (int32_t) k->type;
1848
+ if (k_type_i != k_type_i_ref) {
1849
+ LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
1850
+ return false;
1851
+ }
1852
+
1853
+ // Read row size of key
1854
+ uint64_t k_size_row_ref;
1855
+ io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
1856
+ const size_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
1857
+ if (k_size_row != k_size_row_ref) {
1858
+ LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
1859
+ return false;
1860
+ }
1861
+
1862
+ if (cell_count) {
1863
+ if (sinfo.is_contiguous()) {
1864
+ // Fast path: contiguous cells, single memcpy
1865
+ ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
1866
+ } else {
1867
+ // Slow path: scatter to non-contiguous positions
1868
+ const void * src = io.read(cell_count * k_size_row);
1869
+ for (uint32_t i = 0; i < cell_count; ++i) {
1870
+ const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
1871
+ ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
1872
+ }
1873
+ }
1874
+ }
1875
+ }
1876
+
1877
+ if (!this->v_trans) {
1878
+ for (const auto & layer : layers) {
1879
+ const uint32_t il = layer.il;
1880
+
1881
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1882
+
1883
+ auto * v = layer.v_stream[strm];
1884
+
1885
+ // Read type of value
1886
+ int32_t v_type_i_ref;
1887
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1888
+ const int32_t v_type_i = (int32_t) v->type;
1889
+ if (v_type_i != v_type_i_ref) {
1890
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1891
+ return false;
1892
+ }
1893
+
1894
+ // Read row size of value
1895
+ uint64_t v_size_row_ref;
1896
+ io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
1897
+ const size_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
1898
+ if (v_size_row != v_size_row_ref) {
1899
+ LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
1900
+ return false;
1901
+ }
1902
+
1903
+ if (cell_count) {
1904
+ if (sinfo.is_contiguous()) {
1905
+ // Fast path: contiguous cells, single memcpy
1906
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
1907
+ } else {
1908
+ // Slow path: scatter to non-contiguous positions
1909
+ const void * src = io.read(cell_count * v_size_row);
1910
+ for (uint32_t i = 0; i < cell_count; ++i) {
1911
+ const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
1912
+ ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
1913
+ }
1914
+ }
1915
+ }
1916
+ }
1917
+ } else {
1918
+ // For each layer, read the values for each cell (transposed)
1919
+ for (const auto & layer : layers) {
1920
+ const uint32_t il = layer.il;
1921
+
1922
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1923
+
1924
+ auto * v = layer.v_stream[strm];
1925
+
1926
+ // Read type of value
1927
+ int32_t v_type_i_ref;
1928
+ io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
1929
+ const int32_t v_type_i = (int32_t) v->type;
1930
+ if (v_type_i != v_type_i_ref) {
1931
+ LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
1932
+ return false;
1933
+ }
1934
+
1935
+ // Read element size of value
1936
+ uint32_t v_size_el_ref;
1937
+ io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
1938
+ const size_t v_size_el = ggml_type_size(v->type);
1939
+ if (v_size_el != v_size_el_ref) {
1940
+ LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
1941
+ return false;
1942
+ }
1943
+
1944
+ // Read GQA embedding size
1945
+ uint32_t n_embd_v_gqa_ref;
1946
+ io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
1947
+ if (n_embd_v_gqa != n_embd_v_gqa_ref) {
1948
+ LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
1949
+ return false;
1950
+ }
1951
+
1952
+ if (cell_count) {
1953
+ if (sinfo.is_contiguous()) {
1954
+ // Fast path: contiguous cells
1955
+ const uint32_t h = sinfo.head();
1956
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1957
+ const size_t dst_offset = (h + j * cells.size()) * v_size_el;
1958
+ ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
1959
+ }
1960
+ } else {
1961
+ // Slow path: scatter to non-contiguous positions
1962
+ for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
1963
+ const void * src = io.read(cell_count * v_size_el);
1964
+ for (uint32_t i = 0; i < cell_count; ++i) {
1965
+ const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
1966
+ ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
1967
+ }
1968
+ }
1969
+ }
1970
+ }
1971
+ }
1972
+ }
1973
+
1974
+ return true;
1975
+ }
1976
+
1977
+ //
1978
+ // llama_kv_cache_context
1979
+ //
1980
+
1981
+ llama_kv_cache_context::llama_kv_cache_context(llama_memory_status status) : status(status) {}
1982
+
1983
+ llama_kv_cache_context::llama_kv_cache_context(
1984
+ llama_kv_cache * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
1985
+ n_kv = kv->get_size();
1986
+
1987
+ const uint32_t n_stream = kv->get_n_stream();
1988
+
1989
+ // create a dummy slot info - the actual data is irrelevant. we just need to build the graph
1990
+ sinfos.resize(1);
1991
+ sinfos[0].s0 = 0;
1992
+ sinfos[0].s1 = n_stream - 1;
1993
+ sinfos[0].idxs.resize(n_stream);
1994
+ for (uint32_t s = 0; s < n_stream; ++s) {
1995
+ sinfos[0].strm.push_back(s);
1996
+ sinfos[0].idxs[s].resize(1, 0);
1997
+ }
1998
+ }
1999
+
2000
+ llama_kv_cache_context::llama_kv_cache_context(
2001
+ llama_kv_cache * kv,
2002
+ llama_context * lctx,
2003
+ bool do_shift,
2004
+ stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
2005
+ if (!do_shift && this->sc_info.empty()) {
2006
+ status = LLAMA_MEMORY_STATUS_NO_UPDATE;
2007
+ }
2008
+ }
2009
+
2010
+ llama_kv_cache_context::llama_kv_cache_context(
2011
+ llama_kv_cache * kv,
2012
+ llama_kv_cache::slot_info_vec_t sinfos,
2013
+ std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
2014
+ }
2015
+
2016
+ llama_kv_cache_context::~llama_kv_cache_context() = default;
2017
+
2018
+ bool llama_kv_cache_context::next() {
2019
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
2020
+
2021
+ if (++i_cur >= ubatches.size()) {
2022
+ return false;
2023
+ }
2024
+
2025
+ return true;
2026
+ }
2027
+
2028
+ bool llama_kv_cache_context::apply() {
2029
+ assert(!llama_memory_status_is_fail(status));
2030
+
2031
+ // no ubatches -> this is a KV cache update
2032
+ if (ubatches.empty()) {
2033
+ kv->update(lctx, do_shift, sc_info);
2034
+
2035
+ return true;
2036
+ }
2037
+
2038
+ kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
2039
+ n_kv = kv->get_n_kv(sinfos[i_cur]);
2040
+
2041
+ return true;
2042
+ }
2043
+
2044
+ llama_memory_status llama_kv_cache_context::get_status() const {
2045
+ return status;
2046
+ }
2047
+
2048
+ const llama_ubatch & llama_kv_cache_context::get_ubatch() const {
2049
+ assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
2050
+
2051
+ return ubatches[i_cur];
2052
+ }
2053
+
2054
+ uint32_t llama_kv_cache_context::get_n_kv() const {
2055
+ return n_kv;
2056
+ }
2057
+
2058
+ ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
2059
+ return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
2060
+ }
2061
+
2062
+ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) const {
2063
+ return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
2064
+ }
2065
+
2066
+ ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
2067
+ return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
2068
+ }
2069
+
2070
+ ggml_tensor * llama_kv_cache_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
2071
+ return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
2072
+ }
2073
+
2074
+ ggml_tensor * llama_kv_cache_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
2075
+ return kv->build_input_k_idxs(ctx, ubatch);
2076
+ }
2077
+
2078
+ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
2079
+ return kv->build_input_v_idxs(ctx, ubatch);
2080
+ }
2081
+
2082
+ void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
2083
+ kv->set_input_k_shift(dst);
2084
+ }
2085
+
2086
+ void llama_kv_cache_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
2087
+ kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
2088
+ }
2089
+
2090
+ void llama_kv_cache_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
2091
+ kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
2092
+ }
2093
+
2094
+ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
2095
+ kv->set_input_kq_mask(dst, ubatch, causal_attn);
2096
+ }
2097
+
2098
+ void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
2099
+ kv->set_input_pos_bucket(dst, ubatch);
2100
+ }