whispercpp 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (787)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -0
  3. data/LICENSE +1 -1
  4. data/README.md +216 -424
  5. data/Rakefile +79 -11
  6. data/ext/.gitignore +11 -0
  7. data/ext/dependencies.rb +61 -0
  8. data/ext/extconf.rb +18 -26
  9. data/ext/options.rb +221 -0
  10. data/ext/ruby_whisper.c +159 -0
  11. data/ext/ruby_whisper.h +27 -2
  12. data/ext/ruby_whisper_context.c +641 -0
  13. data/ext/ruby_whisper_error.c +52 -0
  14. data/ext/ruby_whisper_model.c +232 -0
  15. data/ext/ruby_whisper_params.c +1301 -0
  16. data/ext/ruby_whisper_segment.c +143 -0
  17. data/ext/ruby_whisper_transcribe.cpp +87 -0
  18. data/ext/ruby_whisper_vad_params.c +288 -0
  19. data/ext/sources/.dockerignore +3 -0
  20. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  21. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  22. data/ext/sources/CMakeLists.txt +251 -0
  23. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  24. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  25. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  26. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  27. data/ext/sources/bindings/javascript/package.json +26 -0
  28. data/ext/sources/bindings/javascript/whisper.js +19 -0
  29. data/ext/sources/build-xcframework.sh +547 -0
  30. data/ext/sources/ci/run.sh +336 -0
  31. data/ext/sources/close-issue.yml +28 -0
  32. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  33. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  34. data/ext/sources/cmake/build-info.cmake +60 -0
  35. data/ext/sources/cmake/git-vars.cmake +22 -0
  36. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  37. data/ext/sources/cmake/whisper.pc.in +10 -0
  38. data/ext/sources/examples/CMakeLists.txt +124 -0
  39. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  40. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  41. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  42. data/ext/sources/examples/addon.node/index.js +54 -0
  43. data/ext/sources/examples/addon.node/package.json +16 -0
  44. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  45. data/ext/sources/examples/bench/bench.cpp +175 -0
  46. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  47. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  48. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  49. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  50. data/ext/sources/examples/cli/cli.cpp +1294 -0
  51. data/ext/sources/examples/coi-serviceworker.js +146 -0
  52. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  53. data/ext/sources/examples/command/command.cpp +776 -0
  54. data/ext/sources/examples/command/commands.txt +9 -0
  55. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  56. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  57. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  58. data/ext/sources/examples/common-ggml.cpp +238 -0
  59. data/ext/sources/examples/common-ggml.h +18 -0
  60. data/ext/sources/examples/common-sdl.cpp +227 -0
  61. data/ext/sources/examples/common-sdl.h +49 -0
  62. data/ext/sources/examples/common-whisper.cpp +168 -0
  63. data/ext/sources/examples/common-whisper.h +24 -0
  64. data/ext/sources/examples/common.cpp +675 -0
  65. data/ext/sources/examples/common.h +322 -0
  66. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  67. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  68. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  69. data/ext/sources/examples/generate-karaoke.sh +57 -0
  70. data/ext/sources/examples/grammar-parser.cpp +423 -0
  71. data/ext/sources/examples/grammar-parser.h +29 -0
  72. data/ext/sources/examples/helpers.js +191 -0
  73. data/ext/sources/examples/json.hpp +24596 -0
  74. data/ext/sources/examples/livestream.sh +112 -0
  75. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  76. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  77. data/ext/sources/examples/lsp/whisper.vim +362 -0
  78. data/ext/sources/examples/miniaudio.h +93468 -0
  79. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  80. data/ext/sources/examples/python/whisper_processor.py +54 -0
  81. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  82. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  83. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  84. data/ext/sources/examples/server/bench.js +29 -0
  85. data/ext/sources/examples/server/httplib.h +10497 -0
  86. data/ext/sources/examples/server/server.cpp +1091 -0
  87. data/ext/sources/examples/server.py +115 -0
  88. data/ext/sources/examples/stb_vorbis.c +5584 -0
  89. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  90. data/ext/sources/examples/stream/stream.cpp +429 -0
  91. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  92. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  93. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  94. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  95. data/ext/sources/examples/sycl/build.sh +22 -0
  96. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  97. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  98. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  99. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  101. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  103. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  105. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  107. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  108. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  109. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  111. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  113. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  115. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  117. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  119. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  120. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  121. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  124. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  126. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  128. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  130. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  132. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  133. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  134. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  136. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  138. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  140. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  141. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  142. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  143. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  144. data/ext/sources/examples/talk-llama/speak +40 -0
  145. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  146. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  147. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  149. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  150. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  151. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  152. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  153. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  154. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  155. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  157. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  159. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  160. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  162. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  163. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  164. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  165. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  166. data/ext/sources/ggml/CMakeLists.txt +390 -0
  167. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  168. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  169. data/ext/sources/ggml/cmake/common.cmake +26 -0
  170. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  171. data/ext/sources/ggml/include/ggml-alloc.h +76 -0
  172. data/ext/sources/ggml/include/ggml-backend.h +354 -0
  173. data/ext/sources/ggml/include/ggml-blas.h +25 -0
  174. data/ext/sources/ggml/include/ggml-cann.h +123 -0
  175. data/ext/sources/ggml/include/ggml-cpp.h +39 -0
  176. data/ext/sources/ggml/include/ggml-cpu.h +143 -0
  177. data/ext/sources/ggml/include/ggml-cuda.h +47 -0
  178. data/ext/sources/ggml/include/ggml-kompute.h +50 -0
  179. data/ext/sources/ggml/include/ggml-metal.h +66 -0
  180. data/ext/sources/ggml/include/ggml-opencl.h +26 -0
  181. data/ext/sources/ggml/include/ggml-opt.h +237 -0
  182. data/ext/sources/ggml/include/ggml-rpc.h +33 -0
  183. data/ext/sources/ggml/include/ggml-sycl.h +49 -0
  184. data/ext/sources/ggml/include/ggml-vulkan.h +29 -0
  185. data/ext/{ggml.h → sources/ggml/include/ggml.h} +621 -821
  186. data/ext/sources/ggml/include/gguf.h +202 -0
  187. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  188. data/ext/sources/ggml/src/ggml-alloc.c +1042 -0
  189. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  190. data/ext/sources/ggml/src/ggml-amx/common.h +94 -0
  191. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  192. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +2510 -0
  193. data/ext/sources/ggml/src/ggml-amx/mmq.h +17 -0
  194. data/ext/sources/ggml/src/ggml-backend-impl.h +255 -0
  195. data/ext/sources/ggml/src/ggml-backend-reg.cpp +586 -0
  196. data/ext/sources/ggml/src/ggml-backend.cpp +2011 -0
  197. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  198. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  199. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  200. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  201. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +181 -0
  202. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +258 -0
  203. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +3193 -0
  204. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  205. data/ext/sources/ggml/src/ggml-cann/common.h +420 -0
  206. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +2606 -0
  207. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  208. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  209. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +234 -0
  210. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  211. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  212. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  213. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  214. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  215. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  216. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  217. data/ext/sources/ggml/src/ggml-common.h +1857 -0
  218. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  219. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
  220. data/ext/sources/ggml/src/ggml-cpu/amx/amx.h +8 -0
  221. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +91 -0
  222. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  223. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  224. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  225. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  226. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  227. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  228. data/ext/sources/ggml/src/ggml-cpu/cpu-feats-x86.cpp +327 -0
  229. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  232. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +508 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +13747 -0
  235. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  236. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  237. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  238. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
  240. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  241. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  244. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  246. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  247. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  248. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  249. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  250. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  251. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  252. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  253. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  254. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  255. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  257. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  259. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  260. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  261. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  262. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  263. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  264. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  265. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  267. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  268. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  270. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  272. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  273. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  274. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  276. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  277. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  278. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  280. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  281. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  282. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  294. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  295. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  296. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  298. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  299. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  309. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  310. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  311. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  312. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  313. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  314. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  318. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  320. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  321. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  322. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  323. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  324. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  326. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  330. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  334. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  458. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  460. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  461. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  462. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  463. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  464. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  465. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  466. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
  467. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +243 -0
  468. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +140 -0
  469. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  470. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  471. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  472. data/ext/sources/ggml/src/ggml-impl.h +601 -0
  473. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  474. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  509. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  510. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  511. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  512. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  513. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  514. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +5998 -0
  515. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +7089 -0
  516. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  517. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  518. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  519. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  520. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  557. data/ext/sources/ggml/src/ggml-opt.cpp +1037 -0
  558. data/ext/sources/ggml/src/ggml-quants.c +5232 -0
  559. data/ext/sources/ggml/src/ggml-quants.h +100 -0
  560. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  561. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +1813 -0
  562. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  563. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  564. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  565. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  566. data/ext/sources/ggml/src/ggml-sycl/common.cpp +83 -0
  567. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  568. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +195 -0
  569. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  570. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +101 -0
  571. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +623 -0
  573. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  574. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  575. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  576. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  577. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +1162 -0
  578. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  579. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  580. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  581. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  582. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  583. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  584. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  585. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +4493 -0
  586. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  587. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  588. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  589. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  590. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  591. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  592. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1110 -0
  593. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  594. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +501 -0
  595. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  596. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +47 -0
  597. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  598. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  599. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  600. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  601. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  602. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +261 -0
  603. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  604. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  605. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  606. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  607. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  608. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  609. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  610. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  611. data/ext/sources/ggml/src/ggml-threading.cpp +12 -0
  612. data/ext/sources/ggml/src/ggml-threading.h +14 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10700 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +751 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  729. data/ext/sources/ggml/src/ggml.c +6550 -0
  730. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  731. data/ext/{whisper.h → sources/include/whisper.h} +91 -24
  732. data/ext/sources/src/CMakeLists.txt +143 -0
  733. data/ext/sources/src/coreml/whisper-decoder-impl.h +158 -0
  734. data/ext/sources/src/coreml/whisper-decoder-impl.m +226 -0
  735. data/ext/sources/src/coreml/whisper-encoder-impl.h +154 -0
  736. data/ext/sources/src/coreml/whisper-encoder-impl.m +222 -0
  737. data/ext/sources/src/coreml/whisper-encoder.h +26 -0
  738. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  739. data/ext/sources/src/openvino/whisper-openvino-encoder.cpp +108 -0
  740. data/ext/sources/src/openvino/whisper-openvino-encoder.h +31 -0
  741. data/ext/sources/src/whisper-arch.h +197 -0
  742. data/ext/{whisper.cpp → sources/src/whisper.cpp} +2535 -835
  743. data/ext/sources/tests/CMakeLists.txt +105 -0
  744. data/ext/sources/tests/earnings21/eval.mk +58 -0
  745. data/ext/sources/tests/earnings21/eval.py +68 -0
  746. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  747. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  748. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  749. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  750. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  751. data/ext/sources/tests/en-0-ref.txt +1 -0
  752. data/ext/sources/tests/en-1-ref.txt +1 -0
  753. data/ext/sources/tests/en-2-ref.txt +1 -0
  754. data/ext/sources/tests/es-0-ref.txt +1 -0
  755. data/ext/sources/tests/librispeech/eval.mk +39 -0
  756. data/ext/sources/tests/librispeech/eval.py +47 -0
  757. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  758. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  759. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  760. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  761. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  762. data/ext/sources/tests/run-tests.sh +130 -0
  763. data/ext/sources/tests/test-c.c +3 -0
  764. data/ext/sources/tests/test-vad-full.cpp +54 -0
  765. data/ext/sources/tests/test-vad.cpp +83 -0
  766. data/ext/sources/tests/test-whisper.js +58 -0
  767. data/extsources.rb +34 -0
  768. data/lib/whisper/model/uri.rb +178 -0
  769. data/sig/whisper.rbs +480 -0
  770. data/tests/helper.rb +35 -0
  771. data/tests/jfk_reader/.gitignore +5 -0
  772. data/tests/jfk_reader/extconf.rb +3 -0
  773. data/tests/jfk_reader/jfk_reader.c +68 -0
  774. data/tests/test_callback.rb +202 -0
  775. data/tests/test_error.rb +20 -0
  776. data/tests/test_model.rb +109 -0
  777. data/tests/test_package.rb +46 -0
  778. data/tests/test_params.rb +297 -0
  779. data/tests/test_segment.rb +74 -0
  780. data/tests/test_vad.rb +19 -0
  781. data/tests/test_vad_params.rb +103 -0
  782. data/tests/test_whisper.rb +212 -124
  783. data/whispercpp.gemspec +37 -0
  784. metadata +794 -13
  785. data/ext/dr_wav.h +0 -6434
  786. data/ext/ggml.c +0 -21755
  787. data/ext/ruby_whisper.cpp +0 -426
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <cuda_runtime.h>
4
+ #include <cuda.h>
5
+ #include <cublas_v2.h>
6
+ #include <cuda_bf16.h>
7
+ #include <cuda_fp16.h>
8
+
9
+ #if CUDART_VERSION < 11020
10
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
11
+ #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
12
+ #define CUBLAS_COMPUTE_16F CUDA_R_16F
13
+ #define CUBLAS_COMPUTE_32F CUDA_R_32F
14
+ #define cublasComputeType_t cudaDataType_t
15
+ #endif // CUDART_VERSION < 11020
@@ -0,0 +1,243 @@
1
+ #pragma once
2
+
3
+ #define HIP_ENABLE_WARP_SYNC_BUILTINS 1
4
+ #include <hip/hip_runtime.h>
5
+ #include <hipblas/hipblas.h>
6
+ #include <hip/hip_fp16.h>
7
+ #include <hip/hip_bfloat16.h>
8
+ #ifdef __HIP_PLATFORM_AMD__
9
+ // for rocblas_initialize()
10
+ #include "rocblas/rocblas.h"
11
+ #endif // __HIP_PLATFORM_AMD__
12
+
13
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
14
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
15
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
16
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
17
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
18
+ #define CUBLAS_OP_N HIPBLAS_OP_N
19
+ #define CUBLAS_OP_T HIPBLAS_OP_T
20
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
21
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
22
+ #define CUDA_R_16F HIPBLAS_R_16F
23
+ #define CUDA_R_16BF HIPBLAS_R_16B
24
+ #define CUDA_R_32F HIPBLAS_R_32F
25
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
26
+ #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
27
+ #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
28
+ #define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
29
+ #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
30
+ #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
31
+ #define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
32
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
33
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
34
+ #define cublasCreate hipblasCreate
35
+ #define cublasDestroy hipblasDestroy
36
+ #define cublasGemmEx hipblasGemmEx
37
+ #define cublasGemmBatchedEx hipblasGemmBatchedEx
38
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
39
+ #define cublasHandle_t hipblasHandle_t
40
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
41
+ #define cublasSetStream hipblasSetStream
42
+ #define cublasSgemm hipblasSgemm
43
+ #define cublasStatus_t hipblasStatus_t
44
+ #define cublasOperation_t hipblasOperation_t
45
+ #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
46
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
47
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
48
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
49
+ #define cudaDeviceProp hipDeviceProp_t
50
+ #define cudaDeviceSynchronize hipDeviceSynchronize
51
+ #define cudaError_t hipError_t
52
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
53
+ #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
54
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
55
+ #define cudaEventDisableTiming hipEventDisableTiming
56
+ #define cudaEventRecord hipEventRecord
57
+ #define cudaEventSynchronize hipEventSynchronize
58
+ #define cudaEvent_t hipEvent_t
59
+ #define cudaEventDestroy hipEventDestroy
60
+ #define cudaFree hipFree
61
+ #define cudaFreeHost hipHostFree
62
+ #define cudaGetDevice hipGetDevice
63
+ #define cudaGetDeviceCount hipGetDeviceCount
64
+ #define cudaGetDeviceProperties hipGetDeviceProperties
65
+ #define cudaGetErrorString hipGetErrorString
66
+ #define cudaGetLastError hipGetLastError
67
+ #define cudaHostRegister hipHostRegister
68
+ #define cudaHostRegisterPortable hipHostRegisterPortable
69
+ #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
70
+ #define cudaHostUnregister hipHostUnregister
71
+ #define cudaLaunchHostFunc hipLaunchHostFunc
72
+ #define cudaMalloc hipMalloc
73
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
74
+ #define cudaMallocManaged hipMallocManaged
75
+ #define cudaMemAdvise hipMemAdvise
76
+ #define cudaMemcpy hipMemcpy
77
+ #define cudaMemcpyAsync hipMemcpyAsync
78
+ #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
79
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
80
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
81
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
82
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
83
+ #define cudaMemcpyKind hipMemcpyKind
84
+ #define cudaMemset hipMemset
85
+ #define cudaMemsetAsync hipMemsetAsync
86
+ #define cudaMemGetInfo hipMemGetInfo
87
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
88
+ #define cudaSetDevice hipSetDevice
89
+ #define cuDeviceGet hipDeviceGet
90
+ #define CUdevice hipDevice_t
91
+ #define CUdeviceptr hipDeviceptr_t
92
+ #define cuMemUnmap hipMemUnmap
93
+ #define CUmemAccessDesc hipMemAccessDesc
94
+ #define cuMemAddressFree hipMemAddressFree
95
+ #define cuMemRelease hipMemRelease
96
+ #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
97
+ #define cuMemCreate hipMemCreate
98
+ #define cuMemAddressReserve hipMemAddressReserve
99
+ #define cuMemMap hipMemMap
100
+ #define cuMemSetAccess hipMemSetAccess
101
+ #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
102
+ #define CUmemAllocationProp hipMemAllocationProp
103
+ #define cuDeviceGetAttribute hipDeviceGetAttribute
104
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
105
+ #define cudaStreamDestroy hipStreamDestroy
106
+ #define cudaStreamFireAndForget hipStreamFireAndForget
107
+ #define cudaStreamNonBlocking hipStreamNonBlocking
108
+ #define cudaStreamPerThread hipStreamPerThread
109
+ #define cudaStreamSynchronize hipStreamSynchronize
110
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
111
+ #define cudaGraphExec_t hipGraphExec_t
112
+ #define cudaGraphNode_t hipGraphNode_t
113
+ #define cudaKernelNodeParams hipKernelNodeParams
114
+ #define cudaKernelNodeParams hipKernelNodeParams
115
+ #define cudaGraphExecDestroy hipGraphExecDestroy
116
+ #define cudaGraphLaunch hipGraphLaunch
117
+ #define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
118
+ #define cudaGraphExecUpdateResult hipGraphExecUpdateResult
119
+ #define cudaGraphNodeType hipGraphNodeType
120
+ #define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
121
+ #define cudaGraphInstantiate hipGraphInstantiate
122
+ #define cudaStreamEndCapture hipStreamEndCapture
123
+ #define cudaGraphDestroy hipGraphDestroy
124
+ #define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
125
+ #define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
126
+ #define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
127
+ #define cudaGraphNodeGetType hipGraphNodeGetType
128
+ #define cudaGraphGetNodes hipGraphGetNodes
129
+ #define cudaGraphExecUpdate hipGraphExecUpdate
130
+ #define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
131
+ #define cudaStreamBeginCapture hipStreamBeginCapture
132
+ #define cudaGraph_t hipGraph_t
133
+ #define cudaStream_t hipStream_t
134
+ #define cudaSuccess hipSuccess
135
+ #define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
136
+ #define __trap() do { abort(); __builtin_unreachable(); } while(0)
137
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
138
+ #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
139
+ #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
140
+ #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
141
+ #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
142
+ #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
143
+ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
144
+ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
145
+ #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
146
+
147
+ #define __CUDA_ARCH__ 1300
148
+
149
+ #if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
150
+ #define GCN
151
+ #endif
152
+
153
+ #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
154
+ #define CDNA
155
+ #endif
156
+
157
+ #if defined(__GFX12__)
158
+ #define RDNA4
159
+ #endif
160
+
161
+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
162
+ defined(__gfx1150__) || defined(__gfx1151__)
163
+ #define RDNA3
164
+ #endif
165
+
166
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
167
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
168
+ #define RDNA2
169
+ #endif
170
+
171
+ #if defined(__gfx1010__) || defined(__gfx1012__)
172
+ #define RDNA1
173
+ #endif
174
+
175
+ #ifndef __has_builtin
176
+ #define __has_builtin(x) 0
177
+ #endif
178
+
179
+ typedef hip_bfloat16 nv_bfloat16;
180
+
181
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
182
+ typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
183
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
184
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
185
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
186
+ #if __has_builtin(__builtin_elementwise_sub_sat)
187
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
188
+ return reinterpret_cast<const int &>(c);
189
+ #else
190
+ int8x4_t c;
191
+ int16_t tmp;
192
+ #pragma unroll
193
+ for (int i = 0; i < 4; i++) {
194
+ tmp = va[i] - vb[i];
195
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
196
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
197
+ c[i] = tmp;
198
+ }
199
+ return reinterpret_cast<int &>(c);
200
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
201
+ }
202
+
203
+ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
204
+ return __vsubss4(a, b);
205
+ }
206
+
207
+ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
208
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
209
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
210
+ unsigned int c;
211
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
212
+ #pragma unroll
213
+ for (int i = 0; i < 4; ++i) {
214
+ vc[i] = va[i] == vb[i] ? 0xff : 0x00;
215
+ }
216
+ return c;
217
+ }
218
+
219
+ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
220
+ const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
221
+ const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
222
+ unsigned int c;
223
+ uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
224
+ #pragma unroll
225
+ for (int i = 0; i < 4; ++i) {
226
+ vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
227
+ }
228
+ return c;
229
+ }
230
+
231
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// ROCm only gained a half2 overload of __shfl_xor() in version 5.6, so older HIP
// versions get one here. The half2 value is passed through the int overload by
// viewing its storage as a single 32-bit integer.
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    union half2_bits {
        half2 h2;
        int   i32;
    };
    half2_bits bits;
    bits.h2  = var;
    bits.i32 = __shfl_xor(bits.i32, laneMask, width);
    return bits.h2;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
@@ -0,0 +1,140 @@
1
+ #pragma once
2
+
3
+ #include <musa_runtime.h>
4
+ #include <musa.h>
5
+ #include <mublas.h>
6
+ #include <musa_bf16.h>
7
+ #include <musa_fp16.h>
8
+ #define CUBLAS_COMPUTE_16F CUDA_R_16F
9
+ #define CUBLAS_COMPUTE_32F CUDA_R_32F
10
+ #define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
11
+ #define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
12
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
13
+ #define CUBLAS_OP_N MUBLAS_OP_N
14
+ #define CUBLAS_OP_T MUBLAS_OP_T
15
+ #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
16
+ #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
17
+ #define CUDA_R_16F MUSA_R_16F
18
+ #define CUDA_R_16BF MUSA_R_16BF
19
+ #define CUDA_R_32F MUSA_R_32F
20
+ #define cublasComputeType_t cudaDataType_t
21
+ #define cublasCreate mublasCreate
22
+ #define cublasDestroy mublasDestroy
23
+ #define cublasGemmEx mublasGemmEx
24
+ #define cublasGemmBatchedEx mublasGemmBatchedEx
25
+ #define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
26
+ #define cublasHandle_t mublasHandle_t
27
+ #define cublasSetMathMode mublasSetMathMode
28
+ #define cublasSetStream mublasSetStream
29
+ #define cublasSgemm mublasSgemm
30
+ #define cublasStatus_t mublasStatus_t
31
+ #define cublasOperation_t mublasOperation_t
32
+ #define cublasGetStatusString mublasStatus_to_string
33
+ #define cudaDataType_t musaDataType_t
34
+ #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
35
+ #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
36
+ #define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
37
+ #define cudaDeviceProp musaDeviceProp
38
+ #define cudaDeviceSynchronize musaDeviceSynchronize
39
+ #define cudaError_t musaError_t
40
+ #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
41
+ #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
42
+ #define cudaEventCreateWithFlags musaEventCreateWithFlags
43
+ #define cudaEventDisableTiming musaEventDisableTiming
44
+ #define cudaEventRecord musaEventRecord
45
+ #define cudaEventSynchronize musaEventSynchronize
46
+ #define cudaEvent_t musaEvent_t
47
+ #define cudaEventDestroy musaEventDestroy
48
+ #define cudaFree musaFree
49
+ #define cudaFreeHost musaFreeHost
50
+ #define cudaGetDevice musaGetDevice
51
+ #define cudaGetDeviceCount musaGetDeviceCount
52
+ #define cudaGetDeviceProperties musaGetDeviceProperties
53
+ #define cudaGetErrorString musaGetErrorString
54
+ #define cudaGetLastError musaGetLastError
55
+ #define cudaHostRegister musaHostRegister
56
+ #define cudaHostRegisterPortable musaHostRegisterPortable
57
+ #define cudaHostRegisterReadOnly musaHostRegisterReadOnly
58
+ #define cudaHostUnregister musaHostUnregister
59
+ #define cudaLaunchHostFunc musaLaunchHostFunc
60
+ #define cudaMalloc musaMalloc
61
+ #define cudaMallocHost musaMallocHost
62
+ #define cudaMallocManaged musaMallocManaged
63
+ #define cudaMemcpy musaMemcpy
64
+ #define cudaMemcpyAsync musaMemcpyAsync
65
+ #define cudaMemcpyPeerAsync musaMemcpyPeerAsync
66
+ #define cudaMemcpy2DAsync musaMemcpy2DAsync
67
+ #define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
68
+ #define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
69
+ #define cudaMemcpyHostToDevice musaMemcpyHostToDevice
70
+ #define cudaMemcpyKind musaMemcpyKind
71
+ #define cudaMemset musaMemset
72
+ #define cudaMemsetAsync musaMemsetAsync
73
+ #define cudaMemGetInfo musaMemGetInfo
74
+ #define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
75
+ #define cudaSetDevice musaSetDevice
76
+ #define cudaStreamCreateWithFlags musaStreamCreateWithFlags
77
+ #define cudaStreamDestroy musaStreamDestroy
78
+ #define cudaStreamFireAndForget musaStreamFireAndForget
79
+ #define cudaStreamNonBlocking musaStreamNonBlocking
80
+ #define cudaStreamPerThread musaStreamPerThread
81
+ #define cudaStreamSynchronize musaStreamSynchronize
82
+ #define cudaStreamWaitEvent musaStreamWaitEvent
83
+ #define cudaStream_t musaStream_t
84
+ #define cudaSuccess musaSuccess
85
+
86
+ // Additional mappings for MUSA virtual memory pool
87
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
88
+ #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
89
+ #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
90
+ #define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
91
+ #define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
92
+ #define CUdevice MUdevice
93
+ #define CUdeviceptr MUdeviceptr
94
+ #define CUmemAccessDesc MUmemAccessDesc
95
+ #define CUmemAllocationProp MUmemAllocationProp
96
+ #define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
97
+ #define cuDeviceGet muDeviceGet
98
+ #define cuDeviceGetAttribute muDeviceGetAttribute
99
+ #define cuMemAddressFree muMemAddressFree
100
+ #define cuMemAddressReserve muMemAddressReserve
101
+ #define cuMemCreate muMemCreate
102
+ #define cuMemGetAllocationGranularity muMemGetAllocationGranularity
103
+ #define cuMemMap muMemMap
104
+ #define cuMemRelease muMemRelease
105
+ #define cuMemSetAccess muMemSetAccess
106
+ #define cuMemUnmap muMemUnmap
107
+ #define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
108
+ #define cudaFuncSetAttribute musaFuncSetAttribute
109
+ #define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
110
+ #define make_cudaExtent make_musaExtent
111
+ #define make_cudaPitchedPtr make_musaPitchedPtr
112
+
113
+ // Additional mappings for MUSA graphs
114
+ #define CUDA_SUCCESS MUSA_SUCCESS
115
+ #define CUresult MUresult
116
+ #define cuGetErrorString muGetErrorString
117
+ #define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
118
+ #define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
119
+ #define cudaGraphDestroy musaGraphDestroy
120
+ #define cudaGraphExecDestroy musaGraphExecDestroy
121
+ #define cudaGraphExec_t musaGraphExec_t
122
+ #define cudaGraphExecUpdate musaGraphExecUpdate
123
+ #define cudaGraphExecUpdateResult musaGraphExecUpdateResult
124
+ #define cudaGraphGetNodes musaGraphGetNodes
125
+ #define cudaGraphInstantiate musaGraphInstantiate
126
+ #define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
127
+ #define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
128
+ #define cudaGraphLaunch musaGraphLaunch
129
+ #define cudaGraphNodeGetType musaGraphNodeGetType
130
+ #define cudaGraphNode_t musaGraphNode_t
131
+ #define cudaGraphNodeType musaGraphNodeType
132
+ #define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
133
+ #define cudaGraph_t musaGraph_t
134
+ #define cudaKernelNodeParams musaKernelNodeParams
135
+ #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
136
+ #define cudaStreamBeginCapture musaStreamBeginCapture
137
+ #define cudaStreamEndCapture musaStreamEndCapture
138
+ #define cudaOccupancyMaxActiveBlocksPerMultiprocessor musaOccupancyMaxActiveBlocksPerMultiprocessor
139
+
140
+ typedef mt_bfloat16 nv_bfloat16;
@@ -0,0 +1,199 @@
1
+ #include "common.cuh"
2
+ #include "wkv.cuh"
3
+
4
+ template <int block_size>
5
+ static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
6
+ const int tid = threadIdx.x;
7
+ const int bid = blockIdx.x;
8
+
9
+ const int head_size = block_size;
10
+ const int batch_i = bid / H;
11
+ const int head_i = bid % H;
12
+ const int state_size = C * head_size;
13
+ const int n_seq_tokens = T / B;
14
+
15
+ float state[head_size];
16
+ __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size];
17
+
18
+ #pragma unroll
19
+ for (int i = 0; i < head_size; i++) {
20
+ state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
21
+ }
22
+
23
+ __syncthreads();
24
+ _tf[tid] = tf[head_i * head_size + tid];
25
+ __syncthreads();
26
+
27
+ for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
28
+ __syncthreads();
29
+ _k[tid] = k[t];
30
+ _r[tid] = r[t];
31
+ _td[tid] = td[t];
32
+ __syncthreads();
33
+
34
+ const float _v = v[t];
35
+ float y = 0;
36
+ for (int j = 0; j < head_size; j += 4) {
37
+ const float4& k = (float4&)(_k[j]);
38
+ const float4& r = (float4&)(_r[j]);
39
+ const float4& tf = (float4&)(_tf[j]);
40
+ const float4& td = (float4&)(_td[j]);
41
+ float4& s = (float4&)(state[j]);
42
+ float4 kv;
43
+
44
+ kv.x = k.x * _v;
45
+ kv.y = k.y * _v;
46
+ kv.z = k.z * _v;
47
+ kv.w = k.w * _v;
48
+
49
+ y += r.x * (tf.x * kv.x + s.x);
50
+ y += r.y * (tf.y * kv.y + s.y);
51
+ y += r.z * (tf.z * kv.z + s.z);
52
+ y += r.w * (tf.w * kv.w + s.w);
53
+
54
+ s.x = s.x * td.x + kv.x;
55
+ s.y = s.y * td.y + kv.y;
56
+ s.z = s.z * td.z + kv.z;
57
+ s.w = s.w * td.w + kv.w;
58
+ }
59
+ dst[t] = y;
60
+ }
61
+
62
+ #pragma unroll
63
+ for (int i = 0; i < head_size; i++) {
64
+ dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
65
+ }
66
+ }
67
+
68
+ template <int block_size>
69
+ static __global__ void rwkv_wkv7_f32(const int B, const int T, const int C, const int H, const float * r, const float * w, const float * k, const float * v, const float * a, const float * b, const float * s, float * dst) {
70
+ const int tid = threadIdx.x;
71
+ const int bid = blockIdx.x;
72
+
73
+ const int head_size = block_size;
74
+ const int batch_i = bid / H;
75
+ const int head_i = bid % H;
76
+ const int state_size = C * head_size;
77
+ const int n_seq_tokens = T / B;
78
+
79
+ float state[head_size];
80
+ __shared__ float _r[head_size], _w[head_size], _k[head_size], _a[head_size], _b[head_size];
81
+
82
+ #ifndef GGML_USE_MUSA
83
+ #pragma unroll
84
+ #endif
85
+ for (int i = 0; i < head_size; i++) {
86
+ state[i] = s[batch_i * state_size + head_i * head_size * head_size + tid * head_size + i];
87
+ }
88
+
89
+ for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
90
+ __syncthreads();
91
+ _r[tid] = r[t];
92
+ _w[tid] = w[t];
93
+ _k[tid] = k[t];
94
+ _a[tid] = a[t];
95
+ _b[tid] = b[t];
96
+ __syncthreads();
97
+
98
+ float sa = 0;
99
+ #pragma unroll
100
+ for (int j = 0; j < head_size; j += 4)
101
+ {
102
+ const float4& a = (float4&)(_a[j]);
103
+ const float4& s = (float4&)(state[j]);
104
+ sa += a.x * s.x;
105
+ sa += a.y * s.y;
106
+ sa += a.z * s.z;
107
+ sa += a.w * s.w;
108
+ }
109
+
110
+ const float _v = v[t];
111
+ float y = 0;
112
+ for (int j = 0; j < head_size; j += 4) {
113
+ const float4& r = (float4&)(_r[j]);
114
+ const float4& w = (float4&)(_w[j]);
115
+ const float4& k = (float4&)(_k[j]);
116
+ const float4& b = (float4&)(_b[j]);
117
+ float4& s = (float4&)(state[j]);
118
+ float4 kv;
119
+
120
+ kv.x = k.x * _v;
121
+ kv.y = k.y * _v;
122
+ kv.z = k.z * _v;
123
+ kv.w = k.w * _v;
124
+
125
+ s.x = s.x * w.x + kv.x + sa * b.x;
126
+ s.y = s.y * w.y + kv.y + sa * b.y;
127
+ s.z = s.z * w.z + kv.z + sa * b.z;
128
+ s.w = s.w * w.w + kv.w + sa * b.w;
129
+
130
+ y += s.x * r.x;
131
+ y += s.y * r.y;
132
+ y += s.z * r.z;
133
+ y += s.w * r.w;
134
+ }
135
+ dst[t] = y;
136
+ }
137
+
138
+ #pragma unroll
139
+ for (int i = 0; i < head_size; i++) {
140
+ dst[T * C + batch_i * state_size + head_i * head_size * head_size + tid * head_size + i] = state[i];
141
+ }
142
+ }
143
+
144
+ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
145
+ const float * k_d = (const float *)dst->src[0]->data;
146
+ const float * v_d = (const float *)dst->src[1]->data;
147
+ const float * r_d = (const float *)dst->src[2]->data;
148
+ const float * tf_d = (const float *)dst->src[3]->data;
149
+ const float * td_d = (const float *)dst->src[4]->data;
150
+ const float * s_d = (const float *)dst->src[5]->data;
151
+
152
+ const int64_t B = dst->src[5]->ne[1];
153
+ const int64_t T = dst->src[0]->ne[2];
154
+ const int64_t C = dst->ne[0];
155
+ const int64_t H = dst->src[0]->ne[1];
156
+
157
+ float * dst_d = (float *)dst->data;
158
+
159
+ cudaStream_t stream = ctx.stream();
160
+
161
+ GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
162
+ GGML_ASSERT(C % H == 0);
163
+ GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE || C / H == CUDA_WKV_BLOCK_SIZE * 2);
164
+
165
+ if (C / H == CUDA_WKV_BLOCK_SIZE) {
166
+ rwkv_wkv_f32<CUDA_WKV_BLOCK_SIZE><<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
167
+ } else {
168
+ rwkv_wkv_f32<CUDA_WKV_BLOCK_SIZE * 2><<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
169
+ }
170
+ }
171
+
172
+ void ggml_cuda_op_rwkv_wkv7(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
173
+ const float * r_d = (const float *)dst->src[0]->data;
174
+ const float * w_d = (const float *)dst->src[1]->data;
175
+ const float * k_d = (const float *)dst->src[2]->data;
176
+ const float * v_d = (const float *)dst->src[3]->data;
177
+ const float * a_d = (const float *)dst->src[4]->data;
178
+ const float * b_d = (const float *)dst->src[5]->data;
179
+ const float * s_d = (const float *)dst->src[6]->data;
180
+
181
+ const int64_t B = dst->src[6]->ne[1];
182
+ const int64_t T = dst->src[0]->ne[2];
183
+ const int64_t C = dst->ne[0];
184
+ const int64_t H = dst->src[0]->ne[1];
185
+
186
+ float * dst_d = (float *)dst->data;
187
+
188
+ cudaStream_t stream = ctx.stream();
189
+
190
+ GGML_ASSERT(dst->src[6]->type == GGML_TYPE_F32);
191
+ GGML_ASSERT(C % H == 0);
192
+ GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE || C / H == CUDA_WKV_BLOCK_SIZE * 2);
193
+
194
+ if (C / H == CUDA_WKV_BLOCK_SIZE) {
195
+ rwkv_wkv7_f32<CUDA_WKV_BLOCK_SIZE><<<B * H, C / H, 0, stream>>>(B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d);
196
+ } else {
197
+ rwkv_wkv7_f32<CUDA_WKV_BLOCK_SIZE * 2><<<B * H, C / H, 0, stream>>>(B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d);
198
+ }
199
+ }
@@ -0,0 +1,7 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_WKV_BLOCK_SIZE 64
4
+
5
+ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
6
+
7
+ void ggml_cuda_op_rwkv_wkv7(ggml_backend_cuda_context & ctx, ggml_tensor * dst);