whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp
@@ -0,0 +1,1154 @@
1
+ #include "llama-memory-recurrent.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-io.h"
5
+ #include "llama-batch.h"
6
+ #include "llama-model.h"
7
+
8
+ #include <algorithm>
9
+ #include <cassert>
10
+ #include <limits>
11
+ #include <map>
12
+ #include <stdexcept>
13
+
14
+ //
15
+ // llama_memory_recurrent
16
+ //
17
+
18
+ llama_memory_recurrent::llama_memory_recurrent(
19
+ const llama_model & model,
20
+ ggml_type type_r,
21
+ ggml_type type_s,
22
+ bool offload,
23
+ uint32_t mem_size,
24
+ uint32_t n_seq_max,
25
+ const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
26
+ const int32_t n_layer = hparams.n_layer;
27
+
28
+ head = 0;
29
+ size = mem_size;
30
+ used = 0;
31
+
32
+ cells.clear();
33
+ cells.resize(mem_size);
34
+
35
+ // create a context for each buffer type
36
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
37
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
38
+ auto it = ctx_map.find(buft);
39
+ if (it == ctx_map.end()) {
40
+ ggml_init_params params = {
41
+ /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
42
+ /*.mem_buffer =*/ NULL,
43
+ /*.no_alloc =*/ true,
44
+ };
45
+
46
+ ggml_context * ctx = ggml_init(params);
47
+ if (!ctx) {
48
+ return nullptr;
49
+ }
50
+
51
+ ctx_map[buft] = ctx;
52
+ ctxs.emplace_back(ctx);
53
+
54
+ return ctx;
55
+ }
56
+
57
+ return it->second;
58
+ };
59
+
60
+ r_l.resize(n_layer);
61
+ s_l.resize(n_layer);
62
+
63
+ for (int i = 0; i < n_layer; i++) {
64
+ if (filter && !filter(i)) {
65
+ LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, i);
66
+ continue;
67
+ }
68
+
69
+ const char * dev_name = "CPU";
70
+
71
+ ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
72
+
73
+ if (offload) {
74
+ auto * dev = model.dev_layer(i);
75
+ buft = ggml_backend_dev_buffer_type(dev);
76
+
77
+ dev_name = ggml_backend_dev_name(dev);
78
+ }
79
+
80
+ LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name);
81
+
82
+ ggml_context * ctx = ctx_for_buft(buft);
83
+ if (!ctx) {
84
+ throw std::runtime_error("failed to create ggml context for rs cache");
85
+ }
86
+
87
+ ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
88
+ ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
89
+ ggml_format_name(r, "cache_r_l%d", i);
90
+ ggml_format_name(s, "cache_s_l%d", i);
91
+ r_l[i] = r;
92
+ s_l[i] = s;
93
+ }
94
+
95
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
96
+ for (auto it : ctx_map) {
97
+ auto * buft = it.first;
98
+ auto * ctx = it.second;
99
+
100
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
101
+ if (!buf) {
102
+ throw std::runtime_error("failed to allocate buffer for rs cache");
103
+ }
104
+ ggml_backend_buffer_clear(buf, 0);
105
+ LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
106
+ bufs.emplace_back(buf);
107
+ }
108
+
109
+ {
110
+ const size_t memory_size_r = size_r_bytes();
111
+ const size_t memory_size_s = size_s_bytes();
112
+
113
+ LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
114
+ (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
115
+ ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
116
+ ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
117
+ }
118
+ }
119
+
120
+ void llama_memory_recurrent::clear(bool data) {
121
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
122
+ cells[i].pos = -1;
123
+ cells[i].seq_id.clear();
124
+ cells[i].src = -1;
125
+ cells[i].tail = -1;
126
+ }
127
+
128
+ head = 0;
129
+ used = 0;
130
+
131
+ if (data) {
132
+ for (auto & buf : bufs) {
133
+ ggml_backend_buffer_clear(buf.get(), 0);
134
+ }
135
+ }
136
+ }
137
+
138
+ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
139
+ uint32_t new_head = size;
140
+
141
+ if (p0 < 0) {
142
+ p0 = 0;
143
+ }
144
+
145
+ if (p1 < 0) {
146
+ p1 = std::numeric_limits<llama_pos>::max();
147
+ }
148
+
149
+ // models like Mamba or RWKV can't have a state partially erased
150
+ if (seq_id >= (int64_t) size) {
151
+ // could be fatal
152
+ return false;
153
+ }
154
+ if (0 <= seq_id) {
155
+ int32_t & tail_id = cells[seq_id].tail;
156
+ if (tail_id >= 0) {
157
+ const auto & cell = cells[tail_id];
158
+ // partial intersection is invalid
159
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
160
+ return false;
161
+ }
162
+ // invalidate tails which will be cleared
163
+ if (p0 <= cell.pos && cell.pos < p1) {
164
+ tail_id = -1;
165
+ }
166
+ }
167
+ } else {
168
+ // seq_id is negative, then the range should include everything or nothing
169
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
170
+ return false;
171
+ }
172
+ }
173
+
174
+ for (uint32_t i = 0; i < size; ++i) {
175
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
176
+ if (seq_id < 0) {
177
+ cells[i].seq_id.clear();
178
+ } else if (cells[i].has_seq_id(seq_id)) {
179
+ cells[i].seq_id.erase(seq_id);
180
+ } else {
181
+ continue;
182
+ }
183
+ if (cells[i].is_empty()) {
184
+ // keep count of the number of used cells
185
+ if (cells[i].pos >= 0) {
186
+ used--;
187
+ }
188
+ cells[i].pos = -1;
189
+ cells[i].src = -1;
190
+ if (new_head == size) {
191
+ new_head = i;
192
+ }
193
+ }
194
+ }
195
+ }
196
+
197
+ // If we freed up a slot, set head to it so searching can start there.
198
+ if (new_head != size && new_head < head) {
199
+ head = new_head;
200
+ }
201
+
202
+ return true;
203
+ }
204
+
+ void llama_memory_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+     if (seq_id_src == seq_id_dst) {
+         return;
+     }
+
+     if (p0 < 0) {
+         p0 = 0;
+     }
+
+     if (p1 < 0) {
+         p1 = std::numeric_limits<llama_pos>::max();
+     }
+
+     if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+         auto & tail_src = cells[seq_id_src];
+         auto & tail_dst = cells[seq_id_dst];
+         if (tail_dst.tail >= 0) {
+             // clear destination seq_id if it wasn't empty
+             auto & cell_dst = cells[tail_dst.tail];
+
+             cell_dst.seq_id.erase(seq_id_dst);
+             tail_dst.tail = -1;
+             if (cell_dst.seq_id.empty()) {
+                 cell_dst.pos = -1;
+                 cell_dst.src = -1;
+                 used -= 1;
+             }
+         }
+         if (tail_src.tail >= 0) {
+             auto & cell_src = cells[tail_src.tail];
+
+             cell_src.seq_id.insert(seq_id_dst);
+             tail_dst.tail = tail_src.tail;
+         }
+     }
+ }
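Note that seq_cp never copies tensor data: it erases the destination's old tail (if any) and then inserts seq_id_dst into the source's tail cell, so both sequences reference the same state until one diverges. In miniature, with std::set standing in for the cell's seq_id set:

    #include <cstdio>
    #include <set>

    int main() {
        std::set<int> tail_owners = { 0 }; // tail cell currently owned by sequence 0
        tail_owners.insert(1);             // seq_cp(0, 1): share the cell, copy nothing
        std::printf("owners of the tail cell: %zu\n", tail_owners.size()); // 2
        return 0;
    }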
+
+ void llama_memory_recurrent::seq_keep(llama_seq_id seq_id) {
+     uint32_t new_head = size;
+
+     for (uint32_t i = 0; i < size; ++i) {
+         if ((llama_seq_id) i != seq_id) {
+             cells[i].tail = -1;
+         }
+
+         if (!cells[i].has_seq_id(seq_id)) {
+             if (cells[i].pos >= 0) {
+                 used--;
+             }
+
+             cells[i].pos = -1;
+             cells[i].src = -1;
+             cells[i].seq_id.clear();
+
+             if (new_head == size) {
+                 new_head = i;
+             }
+         } else {
+             cells[i].seq_id.clear();
+             cells[i].seq_id.insert(seq_id);
+         }
+     }
+
+     // If we freed up a slot, set head to it so searching can start there.
+     if (new_head != size && new_head < head) {
+         head = new_head;
+     }
+ }
+
+ void llama_memory_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+     if (shift == 0) {
+         return;
+     }
+
+     if (p0 < 0) {
+         p0 = 0;
+     }
+
+     if (p1 < 0) {
+         p1 = std::numeric_limits<llama_pos>::max();
+     }
+
+     // If there is no range then return early to avoid looping over the cache.
+     if (p0 == p1) {
+         return;
+     }
+
+     // for Mamba-like or RWKV models, only the pos needs to be shifted
+     if (0 <= seq_id && seq_id < (int64_t) size) {
+         const int32_t tail_id = cells[seq_id].tail;
+         if (tail_id >= 0) {
+             auto & cell = cells[tail_id];
+             if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                 cell.pos += shift;
+             }
+         }
+     }
+ }
+
+ void llama_memory_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+     if (d == 1) {
+         return;
+     }
+
+     if (p0 < 0) {
+         p0 = 0;
+     }
+
+     if (p1 < 0) {
+         p1 = std::numeric_limits<llama_pos>::max();
+     }
+
+     // If there is no range then return early to avoid looping over the cache.
+     if (p0 == p1) {
+         return;
+     }
+
+     // for Mamba-like or RWKV models, only the pos needs to be changed
+     if (0 <= seq_id && seq_id < (int64_t) size) {
+         const int32_t tail_id = cells[seq_id].tail;
+         if (tail_id >= 0) {
+             auto & cell = cells[tail_id];
+             if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                 cell.pos /= d;
+             }
+         }
+     }
+ }
+
+ llama_pos llama_memory_recurrent::seq_pos_min(llama_seq_id seq_id) const {
+     llama_pos result = std::numeric_limits<llama_pos>::max();
+
+     for (uint32_t i = 0; i < size; ++i) {
+         if (cells[i].has_seq_id(seq_id)) {
+             result = std::min(result, cells[i].pos);
+         }
+     }
+
+     if (result == std::numeric_limits<llama_pos>::max()) {
+         result = -1;
+     }
+
+     return result;
+ }
+
+ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
+     llama_pos result = -1;
+
+     for (uint32_t i = 0; i < size; ++i) {
+         if (cells[i].has_seq_id(seq_id)) {
+             result = std::max(result, cells[i].pos);
+         }
+     }
+
+     return result;
+ }
+
+ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+     std::map<ggml_backend_buffer_type_t, size_t> ret;
+     for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+         ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+     }
+     return ret;
+ }
+
+ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+     do {
+         balloc.split_reset();
+
+         std::vector<llama_ubatch> ubatches;
+         while (true) {
+             llama_ubatch ubatch;
+
+             if (embd_all) {
+                 // if all tokens are output, split by sequence
+                 ubatch = balloc.split_seq(n_ubatch);
+             } else {
+                 ubatch = balloc.split_equal(n_ubatch, false);
+             }
+
+             if (ubatch.n_tokens == 0) {
+                 break;
+             }
+
+             ubatches.push_back(std::move(ubatch)); // NOLINT
+         }
+
+         if (balloc.get_n_used() < balloc.get_n_tokens()) {
+             // failed to find a suitable split
+             break;
+         }
+
+         if (!prepare(ubatches)) {
+             break;
+         }
+
+         return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+     } while (false);
+
+     return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+ }
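init_batch relies on the do { ... } while (false) idiom: every failure path is a break that falls through to the single FAILED_PREPARE return, while the success path returns from inside the block. A minimal sketch of the pattern on its own (the stage names are invented):

    #include <cstdio>

    static bool run(bool split_ok, bool prepare_ok) {
        do {
            if (!split_ok)   { break; } // failed to find a suitable split
            if (!prepare_ok) { break; } // prepare() rejected the ubatches
            return true;                // success path exits from inside the block
        } while (false);

        return false;                   // single shared failure exit
    }

    int main() {
        std::printf("%d %d\n", run(true, true), run(true, false)); // 1 0
        return 0;
    }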
+
+ llama_memory_context_ptr llama_memory_recurrent::init_full() {
+     return std::make_unique<llama_memory_recurrent_context>(this);
+ }
+
+ llama_memory_context_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) {
+     GGML_UNUSED(lctx);
+     GGML_UNUSED(optimize);
+
+     return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_NO_UPDATE);
+ }
+
+ bool llama_memory_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
+     // simply remember the full state because it is very small for this type of cache
+     // TODO: optimize
+     auto org_cells = cells;
+     auto org_used = used;
+     auto org_head = head;
+
+     bool success = true;
+
+     for (const auto & ubatch : ubatches) {
+         if (!find_slot(ubatch)) {
+             success = false;
+             break;
+         }
+     }
+
+     // restore the original state
+     cells = std::move(org_cells);
+     used = org_used;
+     head = org_head;
+
+     return success;
+ }
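prepare is a dry run: it snapshots the cell metadata (cheap, because only metadata is copied, never tensor data), probes find_slot for every ubatch, then unconditionally rolls back, so a failed probe leaves the cache untouched. The same save-try-restore shape in miniature, with a vector of ints standing in for the cells:

    #include <cstdio>
    #include <vector>

    static bool try_all(std::vector<int> & state, const std::vector<int> & requests) {
        auto org_state = state;                    // remember the full state
        bool success = true;
        for (int r : requests) {
            if (r < 0) { success = false; break; } // a probe failed
            state.push_back(r);                    // probing mutates the state
        }
        state = std::move(org_state);              // always restore, success or not
        return success;
    }

    int main() {
        std::vector<int> state;
        std::printf("ok=%d size=%zu\n", try_all(state, {1, 2, 3}), state.size()); // ok=1 size=0
        return 0;
    }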
+
+ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
+     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
+     const uint32_t n_seqs = ubatch.n_seqs;
+
+     // if we have enough unused cells before the current head ->
+     // better to start searching from the beginning of the cache, hoping to fill it
+     if (head > used + 2*n_seqs) {
+         head = 0;
+     }
+
+     // For recurrent state architectures (like Mamba or RWKV),
+     // each cache cell can store the state for a whole sequence.
+     // A slot should always be contiguous.
+
+     // can only process batches with an equal number of new tokens in each sequence
+     GGML_ASSERT(ubatch.equal_seqs());
+
+     int32_t min = size - 1;
+     int32_t max = 0;
+
+     // everything should fit if all seq_ids are smaller than the max
+     for (uint32_t s = 0; s < n_seqs; ++s) {
+         const uint32_t i = s*n_seq_tokens; // first token of sequence set s
+         const uint32_t n_seq_id = ubatch.n_seq_id[i];
+
+         for (uint32_t j = 0; j < n_seq_id; ++j) {
+             const llama_seq_id seq_id = ubatch.seq_id[i][j];
+
+             if (seq_id < 0 || (uint32_t) seq_id >= size) {
+                 // too big seq_id
+                 // TODO: would it be possible to resize the cache instead?
+                 LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
+                 return false;
+             }
+             if (j > 0) {
+                 auto & seq = cells[seq_id];
+                 if (seq.tail >= 0) {
+                     auto & cell = cells[seq.tail];
+                     // clear cells from seq_ids that become shared
+                     // (should not normally happen, but let's handle it anyway)
+                     cell.seq_id.erase(seq_id);
+                     seq.tail = -1;
+                     if (cell.seq_id.empty()) {
+                         cell.pos = -1;
+                         cell.src = -1;
+                         used -= 1;
+                     }
+                 }
+             }
+         }
+     }
+
+ #ifndef NDEBUG
+     {
+         std::vector<int32_t> tails_verif;
+         tails_verif.assign(size, -1);
+         for (uint32_t i = 0; i < size; ++i) {
+             auto & cell = cells[i];
+             for (llama_seq_id seq_id : cell.seq_id) {
+                 if (tails_verif[seq_id] != -1) {
+                     LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
+                 }
+                 tails_verif[seq_id] = i;
+             }
+         }
+         for (uint32_t i = 0; i < size; ++i) {
+             if (tails_verif[i] != cells[i].tail) {
+                 LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]);
+             }
+         }
+     }
+ #endif
+
+     // find next empty cell
+     uint32_t next_empty_cell = head;
+
+     for (uint32_t i = 0; i < size; ++i) {
+         if (next_empty_cell >= size) { next_empty_cell -= size; }
+         auto & cell = cells[next_empty_cell];
+         if (cell.is_empty()) { break; }
+         next_empty_cell += 1;
+     }
+
+     // find usable cell range
+     for (uint32_t s = 0; s < n_seqs; ++s) {
+         const uint32_t i = s*n_seq_tokens;
+         const llama_seq_id seq_id = ubatch.seq_id[i][0];
+         auto & seq_meta = cells[seq_id];
+         bool has_cell = false;
+         if (seq_meta.tail >= 0) {
+             auto & cell = cells[seq_meta.tail];
+             GGML_ASSERT(cell.has_seq_id(seq_id));
+             // does this seq_id "own" the cell?
+             if (cell.seq_id.size() == 1) { has_cell = true; }
+         }
+         if (!has_cell) {
+             auto & empty_cell = cells[next_empty_cell];
+             GGML_ASSERT(empty_cell.is_empty());
+             // copy old tail into the empty cell
+             if (seq_meta.tail >= 0) {
+                 auto & orig_cell = cells[seq_meta.tail];
+                 empty_cell.pos = orig_cell.pos;
+                 empty_cell.src = orig_cell.src;
+                 orig_cell.seq_id.erase(seq_id);
+                 empty_cell.seq_id.insert(seq_id); // will be overwritten
+                 GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
+             }
+             seq_meta.tail = next_empty_cell;
+             // find next empty cell
+             if (s + 1 < n_seqs) {
+                 for (uint32_t j = 0; j < size; ++j) {
+                     next_empty_cell += 1;
+                     if (next_empty_cell >= size) { next_empty_cell -= size; }
+                     auto & cell = cells[next_empty_cell];
+                     if (cell.is_empty()) { break; }
+                 }
+             }
+         }
+         if (min > seq_meta.tail) { min = seq_meta.tail; }
+         if (max < seq_meta.tail) { max = seq_meta.tail; }
+     }
+
+     // gather and re-order
+     for (uint32_t s = 0; s < n_seqs; ++s) {
+         const uint32_t i = s*n_seq_tokens;
+         const int32_t dst_id = s + min;
+         const int32_t src_id = cells[ubatch.seq_id[i][0]].tail;
+         if (dst_id != src_id) {
+             auto & dst_cell = cells[dst_id];
+             auto & src_cell = cells[src_id];
+
+             std::swap(dst_cell.pos, src_cell.pos);
+             std::swap(dst_cell.src, src_cell.src);
+             std::swap(dst_cell.seq_id, src_cell.seq_id);
+
+             // swap tails
+             for (uint32_t j = 0; j < size; ++j) {
+                 int32_t & tail = cells[j].tail;
+                 if (tail == src_id) {
+                     tail = dst_id;
+                 } else if (tail == dst_id) {
+                     tail = src_id;
+                 }
+             }
+         }
+     }
+
+     // update the pos of the used seqs
+     for (uint32_t s = 0; s < n_seqs; ++s) {
+         const uint32_t i = s*n_seq_tokens;
+         const llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
+         const int32_t cell_id = s + min;
+         auto & cell = cells[cell_id];
+
+         if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
+             // What should happen when the pos backtracks or skips a value?
+             // Clearing the state mid-batch would require special-casing which isn't done.
+             LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
+                 __func__, last_pos, cell.pos, ubatch.seq_id[i][0], n_seq_tokens);
+         }
+         cell.pos = last_pos;
+         cell.seq_id.clear();
+         for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
+             const llama_seq_id seq_id = ubatch.seq_id[i][j];
+             cell.seq_id.insert(seq_id);
+             cells[seq_id].tail = cell_id;
+         }
+     }
+
+     // Find first cell without src refs, to use as the zero-ed state
+     {
+         // TODO: bake-in src refcounts in the cell metadata
+         std::vector<int32_t> refcounts(size, 0);
+         for (size_t i = 0; i < size; ++i) {
+             const int32_t src = cells[i].src;
+             if (src >= 0) {
+                 refcounts[src] += 1;
+             }
+         }
+
+         rs_z = -1;
+         for (int i = min; i <= max; ++i) {
+             if (refcounts[i] == 0) {
+                 rs_z = i;
+                 break;
+             }
+         }
+
+         for (int i = min; i <= max; ++i) {
+             if (cells[i].src < 0) {
+                 GGML_ASSERT(rs_z >= 0);
+                 cells[i].src0 = rs_z;
+             } else {
+                 // Stage the source ids for all used cells to allow correct seq_* behavior
+                 // and still make these values available when setting the inputs
+                 cells[i].src0 = cells[i].src;
+             }
+             cells[i].src = i; // avoid moving or clearing twice
+         }
+     }
+
+     // allow getting the range of used cells, from head to head + n
+     head = min;
+     n = max - min + 1;
+     used = std::count_if(cells.begin(), cells.end(),
+         [](const mem_cell & cell){ return !cell.is_empty(); });
+
+     // sanity check
+     return n >= n_seqs;
+ }
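One detail of find_slot worth calling out: the next-empty-cell scans treat the cell array as a ring, wrapping the index with a bounds check and subtraction instead of a modulo, and giving up after size steps. A standalone sketch of that search over a toy occupancy array:

    #include <cstdio>
    #include <vector>

    // first free cell at or after head, wrapping around; returns size if all are occupied
    static size_t next_empty(const std::vector<bool> & occupied, size_t head) {
        const size_t size = occupied.size();
        size_t cell = head;
        for (size_t i = 0; i < size; ++i) {
            if (cell >= size) { cell -= size; }   // wrap, as in find_slot
            if (!occupied[cell]) { return cell; }
            cell += 1;
        }
        return size;
    }

    int main() {
        const std::vector<bool> occupied = { true, true, false, true };
        std::printf("%zu\n", next_empty(occupied, 3)); // wraps past the end, finds cell 2
        return 0;
    }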
+
+ bool llama_memory_recurrent::get_can_shift() const {
+     // shifting the pos is trivial for recurrent models
+     return true;
+ }
+
+ size_t llama_memory_recurrent::total_size() const {
+     size_t size = 0;
+     for (const auto & buf : bufs) {
+         size += ggml_backend_buffer_get_size(buf.get());
+     }
+
+     return size;
+ }
+
+ size_t llama_memory_recurrent::size_r_bytes() const {
+     size_t size_r_bytes = 0;
+
+     for (const auto & r : r_l) {
+         if (r != nullptr) {
+             size_r_bytes += ggml_nbytes(r);
+         }
+     }
+
+     return size_r_bytes;
+ }
+
+ size_t llama_memory_recurrent::size_s_bytes() const {
+     size_t size_s_bytes = 0;
+
+     for (const auto & s : s_l) {
+         if (s != nullptr) {
+             size_s_bytes += ggml_nbytes(s);
+         }
+     }
+
+     return size_s_bytes;
+ }
+
+ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+     GGML_UNUSED(flags);
+
+     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+     uint32_t cell_count = 0;
+
+     // Count the number of cells with the specified seq_id
+     // Find all the ranges of cells with this seq id (or all, when -1)
+     uint32_t cell_range_begin = size;
+     for (uint32_t i = 0; i < size; ++i) {
+         const auto & cell = cells[i];
+         if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+             ++cell_count;
+             if (cell_range_begin == size) {
+                 cell_range_begin = i;
+             }
+         } else {
+             if (cell_range_begin != size) {
+                 cell_ranges.emplace_back(cell_range_begin, i);
+                 cell_range_begin = size;
+             }
+         }
+     }
+     if (cell_range_begin != size) {
+         cell_ranges.emplace_back(cell_range_begin, size);
+     }
+
+     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+     uint32_t cell_count_check = 0;
+     for (const auto & range : cell_ranges) {
+         cell_count_check += range.second - range.first;
+     }
+     GGML_ASSERT(cell_count == cell_count_check);
+
+     io.write(&cell_count, sizeof(cell_count));
+
+     state_write_meta(io, cell_ranges, seq_id);
+     state_write_data(io, cell_ranges);
+ }
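The range construction in state_write is a single run-length pass: open a range at the first matching cell, close it at the first miss, and flush the trailing run after the loop, with begin == size acting as the "no open range" sentinel. The same pass in a self-contained form over a boolean predicate array (the input data is made up):

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // builds [begin, end) runs of true values, as state_write does for matching cells
    static std::vector<std::pair<uint32_t, uint32_t>> build_ranges(const std::vector<bool> & match) {
        const uint32_t size = (uint32_t) match.size();
        std::vector<std::pair<uint32_t, uint32_t>> ranges;
        uint32_t begin = size;                    // sentinel: no open range
        for (uint32_t i = 0; i < size; ++i) {
            if (match[i]) {
                if (begin == size) { begin = i; } // open a new run
            } else if (begin != size) {
                ranges.emplace_back(begin, i);    // close the run at the first miss
                begin = size;
            }
        }
        if (begin != size) { ranges.emplace_back(begin, size); } // flush the trailing run
        return ranges;
    }

    int main() {
        // cells 1-2 and 4 hold the target sequence
        for (const auto & r : build_ranges({ false, true, true, false, true })) {
            std::printf("[%u, %u) ", r.first, r.second); // [1, 3) [4, 5)
        }
        std::printf("\n");
        return 0;
    }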
+
+ void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+     GGML_UNUSED(flags);
+
+     uint32_t cell_count;
+     io.read_to(&cell_count, sizeof(cell_count));
+
+     bool res = true;
+
+     res = res && state_read_meta(io, cell_count, seq_id);
+     res = res && state_read_data(io, cell_count);
+
+     if (!res) {
+         if (seq_id == -1) {
+             clear(true);
+         } else {
+             seq_rm(seq_id, -1, -1);
+         }
+         throw std::runtime_error("failed to restore kv cache");
+     }
+ }
+
+ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+     for (const auto & range : cell_ranges) {
+         for (uint32_t i = range.first; i < range.second; ++i) {
+             const auto & cell = cells[i];
+             const llama_pos pos = cell.pos;
+             const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
+
+             io.write(&pos, sizeof(pos));
+             io.write(&n_seq_id, sizeof(n_seq_id));
+
+             if (n_seq_id) {
+                 for (auto seq_id : cell.seq_id) {
+                     io.write(&seq_id, sizeof(seq_id));
+                 }
+             }
+         }
+     }
+ }
+
+ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+     const uint32_t s_trans = 0;
+     const uint32_t n_layer = hparams.n_layer;
+
+     io.write(&s_trans, sizeof(s_trans));
+     io.write(&n_layer, sizeof(n_layer));
+
+     // Iterate and write all the r states first, each row is a cell
+     // Get whole range at a time
+     for (uint32_t il = 0; il < n_layer; ++il) {
+         // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+         if (r_l[il] == nullptr) continue;
+
+         // Write r type
+         const int32_t r_type_i = (int32_t)r_l[il]->type;
+         io.write(&r_type_i, sizeof(r_type_i));
+
+         // Write row size of r
+         const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
+         io.write(&r_size_row, sizeof(r_size_row));
+
+         // Write each range of cells, r_size_row bytes per cell
+         for (const auto & range : cell_ranges) {
+             const size_t range_size = range.second - range.first;
+             const size_t buf_size = range_size * r_size_row;
+             io.write_tensor(r_l[il], range.first * r_size_row, buf_size);
+         }
+     }
+
+     if (!s_trans) {
+         for (uint32_t il = 0; il < n_layer; ++il) {
+             // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+             if (s_l[il] == nullptr) continue;
+
+             // Write s type
+             const int32_t s_type_i = (int32_t)s_l[il]->type;
+             io.write(&s_type_i, sizeof(s_type_i));
+
+             // Write row size of s
+             const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
+             io.write(&s_size_row, sizeof(s_size_row));
+
+             // Write each range of cells, s_size_row bytes per cell
+             for (const auto & range : cell_ranges) {
+                 const size_t range_size = range.second - range.first;
+                 const size_t buf_size = range_size * s_size_row;
+                 io.write_tensor(s_l[il], range.first * s_size_row, buf_size);
+             }
+         }
+     } else {
+         // When s is transposed, we also need the element size and get the element ranges from each row
+         const uint32_t mem_size = size;
+         for (uint32_t il = 0; il < n_layer; ++il) {
+             // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+             if (s_l[il] == nullptr) continue;
+
+             const uint32_t n_embd_s = hparams.n_embd_s();
+
+             // Write s type
+             const int32_t s_type_i = (int32_t)s_l[il]->type;
+             io.write(&s_type_i, sizeof(s_type_i));
+
+             // Write element size
+             const uint32_t s_size_el = ggml_type_size(s_l[il]->type);
+             io.write(&s_size_el, sizeof(s_size_el));
+
+             // Write state embedding size
+             io.write(&n_embd_s, sizeof(n_embd_s));
+
+             // For each row, we get the element values of each cell
+             for (uint32_t j = 0; j < n_embd_s; ++j) {
+                 // Write each range of cells, s_size_el bytes per element
+                 for (const auto & range : cell_ranges) {
+                     const size_t range_size = range.second - range.first;
+                     const size_t src_offset = (range.first + j * mem_size) * s_size_el;
+                     const size_t buf_size = range_size * s_size_el;
+                     io.write_tensor(s_l[il], src_offset, buf_size);
+                 }
+             }
+         }
+     }
+ }
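In the transposed branch above, the state tensor is addressed as an [n_embd_s, mem_size] row-major matrix, so the cells of a range within row j start at element offset range.first + j * mem_size. A worked example of that offset arithmetic (all sizes are made-up):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t mem_size  = 8; // hypothetical number of cells
        const size_t s_size_el = 4; // hypothetical element size in bytes (e.g. f32)
        const size_t j         = 2; // third row of the transposed state matrix
        const size_t first     = 5; // range.first: first cell of the range

        // same formula as src_offset in state_write_data
        const size_t src_offset = (first + j * mem_size) * s_size_el;
        std::printf("src_offset = %zu bytes\n", src_offset); // (5 + 2*8) * 4 = 84
        return 0;
    }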
+
+ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+     if (dest_seq_id != -1) {
+         // single sequence
+
+         seq_rm(dest_seq_id, -1, -1);
+
+         llama_batch_allocr balloc(hparams.n_pos_per_embd());
+
+         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
+
+         for (uint32_t i = 0; i < cell_count; ++i) {
+             llama_pos pos;
+             uint32_t n_seq_id;
+
+             io.read_to(&pos, sizeof(pos));
+             io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+             if (n_seq_id != 0) {
+                 LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                 return false;
+             }
+
+             ubatch.pos[i] = pos;
+         }
+         ubatch.n_seq_id[0] = 1;
+         ubatch.seq_id[0] = &dest_seq_id;
+
+         if (!find_slot(ubatch)) {
+             LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+             return false;
+         }
+
+         // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+         // Assume that this is one contiguous block of cells
+         GGML_ASSERT(head + cell_count <= size);
+         GGML_ASSERT(cells[head].pos == ubatch.pos[0]);
+         GGML_ASSERT(cells[head + cell_count - 1].pos == ubatch.pos[cell_count - 1]);
+         GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
+         GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
+     } else {
+         // whole KV cache restore
+
+         if (cell_count > size) {
+             LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+             return false;
+         }
+
+         clear(true);
+
+         for (uint32_t i = 0; i < cell_count; ++i) {
+             auto & cell = cells[i];
+
+             llama_pos pos;
+             uint32_t n_seq_id;
+
+             io.read_to(&pos, sizeof(pos));
+             io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+             cell.pos = pos;
+
+             for (uint32_t j = 0; j < n_seq_id; ++j) {
+                 llama_seq_id seq_id;
+                 io.read_to(&seq_id, sizeof(seq_id));
+
+                 // TODO: llama_memory_recurrent should have a notion of max sequences
+                 //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                 if (seq_id < 0) {
+                     //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                     LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+                     return false;
+                 }
+
+                 cell.seq_id.insert(seq_id);
+
+                 int32_t & tail = cells[seq_id].tail;
+                 if (tail != -1) {
+                     LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+                     return false;
+                 }
+                 tail = i;
+             }
+         }
+
+         head = 0;
+         used = cell_count;
+     }
+
+     for (uint32_t i = 0; i < cell_count; ++i) {
+         uint32_t cell_id = head + i;
+         // make sure the recurrent states will keep their restored state
+         cells[cell_id].src = cell_id;
+     }
+
+     return true;
+ }
+
+ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+     uint32_t s_trans;
+     uint32_t n_layer;
+     io.read_to(&s_trans, sizeof(s_trans));
+     io.read_to(&n_layer, sizeof(n_layer));
+
+     if (n_layer != hparams.n_layer) {
+         LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+         return false;
+     }
+     if (cell_count > size) {
+         LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
+         return false;
+     }
+     if (false != (bool) s_trans) {
+         LLAMA_LOG_ERROR("%s: incompatible s transposition\n", __func__);
+         return false;
+     }
+
+     // For each layer, read the r states for each cell, one row is one cell, read as one contiguous block
+     for (uint32_t il = 0; il < n_layer; ++il) {
+         // skip null layers
+         if (r_l[il] == nullptr) continue;
+
+         // Read r type
+         int32_t r_type_i_ref;
+         io.read_to(&r_type_i_ref, sizeof(r_type_i_ref));
+         const int32_t r_type_i = (int32_t) r_l[il]->type;
+         if (r_type_i != r_type_i_ref) {
+             LLAMA_LOG_ERROR("%s: mismatched r type (%d != %d, layer %d)\n", __func__, r_type_i, r_type_i_ref, il);
+             return false;
+         }
+
+         // Read row size of r
+         uint64_t r_size_row_ref;
+         io.read_to(&r_size_row_ref, sizeof(r_size_row_ref));
+         const size_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
+         if (r_size_row != r_size_row_ref) {
+             LLAMA_LOG_ERROR("%s: mismatched r row size (%zu != %zu, layer %d)\n", __func__, r_size_row, (size_t) r_size_row_ref, il);
+             return false;
+         }
+
+         if (cell_count) {
+             // Read and set the r states for the whole cell range
+             ggml_backend_tensor_set(r_l[il], io.read(cell_count * r_size_row), head * r_size_row, cell_count * r_size_row);
+         }
+     }
+
+     if (!s_trans) {
+         for (uint32_t il = 0; il < n_layer; ++il) {
+             // skip null layers
+             if (s_l[il] == nullptr) continue;
+
+             // Read s type
+             int32_t s_type_i_ref;
+             io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
+             const int32_t s_type_i = (int32_t)s_l[il]->type;
+
+             if (s_type_i != s_type_i_ref) {
+                 LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
+                 return false;
+             }
+
+             // Read row size of s
+             uint64_t s_size_row_ref;
+             io.read_to(&s_size_row_ref, sizeof(s_size_row_ref));
+             const size_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
+             if (s_size_row != s_size_row_ref) {
+                 LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, il);
+                 return false;
+             }
+
+             if (cell_count) {
+                 // Read and set the s states for the whole cell range
+                 ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_row), head * s_size_row, cell_count * s_size_row);
+             }
+         }
+     } else {
+         // For each layer, read the s states for each cell (transposed)
+         for (uint32_t il = 0; il < n_layer; ++il) {
+             // skip null layers
+             if (s_l[il] == nullptr) continue;
+
+             const uint32_t n_embd_s = hparams.n_embd_s();
+
+             // Read s type
+             int32_t s_type_i_ref;
+             io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
+             const int32_t s_type_i = (int32_t)s_l[il]->type;
+             if (s_type_i != s_type_i_ref) {
+                 LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
+                 return false;
+             }
+
+             // Read element size of s
+             uint32_t s_size_el_ref;
+             io.read_to(&s_size_el_ref, sizeof(s_size_el_ref));
+             const size_t s_size_el = ggml_type_size(s_l[il]->type);
+             if (s_size_el != s_size_el_ref) {
+                 LLAMA_LOG_ERROR("%s: mismatched s element size (%zu != %zu, layer %d)\n", __func__, s_size_el, (size_t) s_size_el_ref, il);
+                 return false;
+             }
+
+             // Read state embedding size
+             uint32_t n_embd_s_ref;
+             io.read_to(&n_embd_s_ref, sizeof(n_embd_s_ref));
+             if (n_embd_s != n_embd_s_ref) {
+                 LLAMA_LOG_ERROR("%s: mismatched s embedding size (%u != %u, layer %d)\n", __func__, n_embd_s, n_embd_s_ref, il);
+                 return false;
+             }
+
+             if (cell_count) {
+                 // For each row in the transposed matrix, read the values for the whole cell range
+                 for (uint32_t j = 0; j < n_embd_s; ++j) {
+                     const size_t dst_offset = (head + j * size) * s_size_el;
+                     ggml_backend_tensor_set(s_l[il], io.read(cell_count * s_size_el), dst_offset, cell_count * s_size_el);
+                 }
+             }
+         }
+     }
+
+     return true;
+ }
+
+ //
+ // llama_memory_recurrent_context
+ //
+
+ llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {}
+
+ llama_memory_recurrent_context::llama_memory_recurrent_context(
+         llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) {
+ }
+
+ llama_memory_recurrent_context::llama_memory_recurrent_context(
+         llama_memory_recurrent * mem,
+         std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {}
+
+ llama_memory_recurrent_context::~llama_memory_recurrent_context() = default;
+
+ bool llama_memory_recurrent_context::next() {
+     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+     if (++i_next >= ubatches.size()) {
+         return false;
+     }
+
+     return true;
+ }
+
+ bool llama_memory_recurrent_context::apply() {
+     assert(!llama_memory_status_is_fail(status));
+
+     // no ubatches -> this is an update
+     if (ubatches.empty()) {
+         // recurrent cache never performs updates
+         assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
+
+         return true;
+     }
+
+     mem->find_slot(ubatches[i_next]);
+
+     return true;
+ }
+
+ llama_memory_status llama_memory_recurrent_context::get_status() const {
+     return status;
+ }
+
+ const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const {
+     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+     return ubatches[i_next];
+ }
+
+ uint32_t llama_memory_recurrent_context::get_n_rs() const {
+     return is_full ? mem->size : mem->n;
+ }
+
+ uint32_t llama_memory_recurrent_context::get_head() const {
+     return is_full ? 0 : mem->head;
+ }
+
+ int32_t llama_memory_recurrent_context::get_rs_z() const {
+     return is_full ? 0 : mem->rs_z;
+ }
+
+ uint32_t llama_memory_recurrent_context::get_size() const {
+     return mem->size;
+ }
+
+ ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const {
+     return mem->r_l[il];
+ }
+
+ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
+     return mem->s_l[il];
+ }
+
+ int32_t llama_memory_recurrent_context::s_copy(int i) const {
+     return mem->cells[i + mem->head].src0;
+ }
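Finally, note how the context narrows the cache to a window: after find_slot, get_head and get_n_rs describe only the cells claimed for the current ubatch, and s_copy translates a window-relative index i into the absolute cell index i + head before reading the staged src0. A tiny sketch of that windowed indexing, with plain arrays in place of the real cell structs:

    #include <cstdio>
    #include <vector>

    int main() {
        // staged source ids for an 8-cell cache (made-up values)
        const std::vector<int> src0 = { 0, 1, 2, 3, 4, 5, 6, 7 };
        const int head = 3; // window start chosen by find_slot
        const int n_rs = 2; // number of cells in the window

        for (int i = 0; i < n_rs; ++i) {
            std::printf("s_copy(%d) = %d\n", i, src0[i + head]); // reads cells 3 and 4
        }
        return 0;
    }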