whispercpp 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (787)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -0
  3. data/LICENSE +1 -1
  4. data/README.md +216 -424
  5. data/Rakefile +79 -11
  6. data/ext/.gitignore +11 -0
  7. data/ext/dependencies.rb +61 -0
  8. data/ext/extconf.rb +18 -26
  9. data/ext/options.rb +221 -0
  10. data/ext/ruby_whisper.c +159 -0
  11. data/ext/ruby_whisper.h +27 -2
  12. data/ext/ruby_whisper_context.c +641 -0
  13. data/ext/ruby_whisper_error.c +52 -0
  14. data/ext/ruby_whisper_model.c +232 -0
  15. data/ext/ruby_whisper_params.c +1301 -0
  16. data/ext/ruby_whisper_segment.c +143 -0
  17. data/ext/ruby_whisper_transcribe.cpp +87 -0
  18. data/ext/ruby_whisper_vad_params.c +288 -0
  19. data/ext/sources/.dockerignore +3 -0
  20. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  21. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  22. data/ext/sources/CMakeLists.txt +251 -0
  23. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  24. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  25. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  26. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  27. data/ext/sources/bindings/javascript/package.json +26 -0
  28. data/ext/sources/bindings/javascript/whisper.js +19 -0
  29. data/ext/sources/build-xcframework.sh +547 -0
  30. data/ext/sources/ci/run.sh +336 -0
  31. data/ext/sources/close-issue.yml +28 -0
  32. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  33. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  34. data/ext/sources/cmake/build-info.cmake +60 -0
  35. data/ext/sources/cmake/git-vars.cmake +22 -0
  36. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  37. data/ext/sources/cmake/whisper.pc.in +10 -0
  38. data/ext/sources/examples/CMakeLists.txt +124 -0
  39. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  40. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  41. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  42. data/ext/sources/examples/addon.node/index.js +54 -0
  43. data/ext/sources/examples/addon.node/package.json +16 -0
  44. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  45. data/ext/sources/examples/bench/bench.cpp +175 -0
  46. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  47. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  48. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  49. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  50. data/ext/sources/examples/cli/cli.cpp +1294 -0
  51. data/ext/sources/examples/coi-serviceworker.js +146 -0
  52. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  53. data/ext/sources/examples/command/command.cpp +776 -0
  54. data/ext/sources/examples/command/commands.txt +9 -0
  55. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  56. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  57. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  58. data/ext/sources/examples/common-ggml.cpp +238 -0
  59. data/ext/sources/examples/common-ggml.h +18 -0
  60. data/ext/sources/examples/common-sdl.cpp +227 -0
  61. data/ext/sources/examples/common-sdl.h +49 -0
  62. data/ext/sources/examples/common-whisper.cpp +168 -0
  63. data/ext/sources/examples/common-whisper.h +24 -0
  64. data/ext/sources/examples/common.cpp +675 -0
  65. data/ext/sources/examples/common.h +322 -0
  66. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  67. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  68. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  69. data/ext/sources/examples/generate-karaoke.sh +57 -0
  70. data/ext/sources/examples/grammar-parser.cpp +423 -0
  71. data/ext/sources/examples/grammar-parser.h +29 -0
  72. data/ext/sources/examples/helpers.js +191 -0
  73. data/ext/sources/examples/json.hpp +24596 -0
  74. data/ext/sources/examples/livestream.sh +112 -0
  75. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  76. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  77. data/ext/sources/examples/lsp/whisper.vim +362 -0
  78. data/ext/sources/examples/miniaudio.h +93468 -0
  79. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  80. data/ext/sources/examples/python/whisper_processor.py +54 -0
  81. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  82. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  83. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  84. data/ext/sources/examples/server/bench.js +29 -0
  85. data/ext/sources/examples/server/httplib.h +10497 -0
  86. data/ext/sources/examples/server/server.cpp +1091 -0
  87. data/ext/sources/examples/server.py +115 -0
  88. data/ext/sources/examples/stb_vorbis.c +5584 -0
  89. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  90. data/ext/sources/examples/stream/stream.cpp +429 -0
  91. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  92. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  93. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  94. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  95. data/ext/sources/examples/sycl/build.sh +22 -0
  96. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  97. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  98. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  99. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  101. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  103. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  105. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  107. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  108. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  109. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  111. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  113. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  115. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  117. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  119. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  120. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  121. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  124. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  126. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  128. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  130. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  132. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  133. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  134. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  136. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  138. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  140. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  141. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  142. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  143. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  144. data/ext/sources/examples/talk-llama/speak +40 -0
  145. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  146. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  147. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  149. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  150. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  151. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  152. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  153. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  154. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  155. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  157. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  159. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  160. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  162. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  163. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  164. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  165. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  166. data/ext/sources/ggml/CMakeLists.txt +390 -0
  167. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  168. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  169. data/ext/sources/ggml/cmake/common.cmake +26 -0
  170. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  171. data/ext/sources/ggml/include/ggml-alloc.h +76 -0
  172. data/ext/sources/ggml/include/ggml-backend.h +354 -0
  173. data/ext/sources/ggml/include/ggml-blas.h +25 -0
  174. data/ext/sources/ggml/include/ggml-cann.h +123 -0
  175. data/ext/sources/ggml/include/ggml-cpp.h +39 -0
  176. data/ext/sources/ggml/include/ggml-cpu.h +143 -0
  177. data/ext/sources/ggml/include/ggml-cuda.h +47 -0
  178. data/ext/sources/ggml/include/ggml-kompute.h +50 -0
  179. data/ext/sources/ggml/include/ggml-metal.h +66 -0
  180. data/ext/sources/ggml/include/ggml-opencl.h +26 -0
  181. data/ext/sources/ggml/include/ggml-opt.h +237 -0
  182. data/ext/sources/ggml/include/ggml-rpc.h +33 -0
  183. data/ext/sources/ggml/include/ggml-sycl.h +49 -0
  184. data/ext/sources/ggml/include/ggml-vulkan.h +29 -0
  185. data/ext/{ggml.h → sources/ggml/include/ggml.h} +621 -821
  186. data/ext/sources/ggml/include/gguf.h +202 -0
  187. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  188. data/ext/sources/ggml/src/ggml-alloc.c +1042 -0
  189. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  190. data/ext/sources/ggml/src/ggml-amx/common.h +94 -0
  191. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  192. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +2510 -0
  193. data/ext/sources/ggml/src/ggml-amx/mmq.h +17 -0
  194. data/ext/sources/ggml/src/ggml-backend-impl.h +255 -0
  195. data/ext/sources/ggml/src/ggml-backend-reg.cpp +586 -0
  196. data/ext/sources/ggml/src/ggml-backend.cpp +2011 -0
  197. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  198. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  199. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  200. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  201. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +181 -0
  202. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +258 -0
  203. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +3193 -0
  204. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  205. data/ext/sources/ggml/src/ggml-cann/common.h +420 -0
  206. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +2606 -0
  207. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  208. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  209. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +234 -0
  210. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  211. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  212. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  213. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  214. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  215. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  216. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  217. data/ext/sources/ggml/src/ggml-common.h +1857 -0
  218. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  219. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +221 -0
  220. data/ext/sources/ggml/src/ggml-cpu/amx/amx.h +8 -0
  221. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +91 -0
  222. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  223. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  224. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  225. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  226. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  227. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  228. data/ext/sources/ggml/src/ggml-cpu/cpu-feats-x86.cpp +327 -0
  229. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  232. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +508 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +13747 -0
  235. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  236. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  237. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  238. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  239. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +671 -0
  240. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  241. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  242. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  243. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  244. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  245. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  246. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  247. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  248. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  249. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  250. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  251. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  252. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  253. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  254. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  255. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  257. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  259. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  260. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  261. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  262. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  263. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  264. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  265. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  267. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  268. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  270. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  272. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  273. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  274. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  276. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  277. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  278. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  279. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  280. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  281. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  282. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  291. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  292. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  293. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  294. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  295. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  296. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  298. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  299. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  301. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  308. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  309. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  310. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  311. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  312. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  313. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  314. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  318. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  319. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  320. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  321. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  322. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  323. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  324. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  326. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  330. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  334. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  458. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  460. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  461. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  462. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  463. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  464. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  465. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  466. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +15 -0
  467. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +243 -0
  468. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +140 -0
  469. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  470. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  471. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  472. data/ext/sources/ggml/src/ggml-impl.h +601 -0
  473. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  474. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  509. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  510. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  511. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  512. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  513. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  514. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +5998 -0
  515. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +7089 -0
  516. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  517. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  518. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  519. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  520. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  557. data/ext/sources/ggml/src/ggml-opt.cpp +1037 -0
  558. data/ext/sources/ggml/src/ggml-quants.c +5232 -0
  559. data/ext/sources/ggml/src/ggml-quants.h +100 -0
  560. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  561. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +1813 -0
  562. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  563. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  564. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  565. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  566. data/ext/sources/ggml/src/ggml-sycl/common.cpp +83 -0
  567. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  568. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +195 -0
  569. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  570. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +101 -0
  571. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +623 -0
  573. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  574. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  575. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  576. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  577. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +1162 -0
  578. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  579. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  580. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  581. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  582. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  583. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  584. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  585. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +4493 -0
  586. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  587. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  588. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  589. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  590. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  591. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  592. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1110 -0
  593. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  594. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +501 -0
  595. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  596. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +47 -0
  597. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  598. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  599. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  600. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  601. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  602. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +261 -0
  603. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  604. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  605. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  606. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  607. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  608. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  609. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  610. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  611. data/ext/sources/ggml/src/ggml-threading.cpp +12 -0
  612. data/ext/sources/ggml/src/ggml-threading.h +14 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +10700 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +751 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  729. data/ext/sources/ggml/src/ggml.c +6550 -0
  730. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  731. data/ext/{whisper.h → sources/include/whisper.h} +91 -24
  732. data/ext/sources/src/CMakeLists.txt +143 -0
  733. data/ext/sources/src/coreml/whisper-decoder-impl.h +158 -0
  734. data/ext/sources/src/coreml/whisper-decoder-impl.m +226 -0
  735. data/ext/sources/src/coreml/whisper-encoder-impl.h +154 -0
  736. data/ext/sources/src/coreml/whisper-encoder-impl.m +222 -0
  737. data/ext/sources/src/coreml/whisper-encoder.h +26 -0
  738. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  739. data/ext/sources/src/openvino/whisper-openvino-encoder.cpp +108 -0
  740. data/ext/sources/src/openvino/whisper-openvino-encoder.h +31 -0
  741. data/ext/sources/src/whisper-arch.h +197 -0
  742. data/ext/{whisper.cpp → sources/src/whisper.cpp} +2535 -835
  743. data/ext/sources/tests/CMakeLists.txt +105 -0
  744. data/ext/sources/tests/earnings21/eval.mk +58 -0
  745. data/ext/sources/tests/earnings21/eval.py +68 -0
  746. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  747. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  748. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  749. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  750. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  751. data/ext/sources/tests/en-0-ref.txt +1 -0
  752. data/ext/sources/tests/en-1-ref.txt +1 -0
  753. data/ext/sources/tests/en-2-ref.txt +1 -0
  754. data/ext/sources/tests/es-0-ref.txt +1 -0
  755. data/ext/sources/tests/librispeech/eval.mk +39 -0
  756. data/ext/sources/tests/librispeech/eval.py +47 -0
  757. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  758. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  759. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  760. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  761. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  762. data/ext/sources/tests/run-tests.sh +130 -0
  763. data/ext/sources/tests/test-c.c +3 -0
  764. data/ext/sources/tests/test-vad-full.cpp +54 -0
  765. data/ext/sources/tests/test-vad.cpp +83 -0
  766. data/ext/sources/tests/test-whisper.js +58 -0
  767. data/extsources.rb +34 -0
  768. data/lib/whisper/model/uri.rb +178 -0
  769. data/sig/whisper.rbs +480 -0
  770. data/tests/helper.rb +35 -0
  771. data/tests/jfk_reader/.gitignore +5 -0
  772. data/tests/jfk_reader/extconf.rb +3 -0
  773. data/tests/jfk_reader/jfk_reader.c +68 -0
  774. data/tests/test_callback.rb +202 -0
  775. data/tests/test_error.rb +20 -0
  776. data/tests/test_model.rb +109 -0
  777. data/tests/test_package.rb +46 -0
  778. data/tests/test_params.rb +297 -0
  779. data/tests/test_segment.rb +74 -0
  780. data/tests/test_vad.rb +19 -0
  781. data/tests/test_vad_params.rb +103 -0
  782. data/tests/test_whisper.rb +212 -124
  783. data/whispercpp.gemspec +37 -0
  784. metadata +794 -13
  785. data/ext/dr_wav.h +0 -6434
  786. data/ext/ggml.c +0 -21755
  787. data/ext/ruby_whisper.cpp +0 -426
@@ -0,0 +1,1162 @@
1
+ #include "convert.hpp"
2
+ #include "dmmv.hpp"
3
+ #include "dequantize.hpp"
4
+ #include "presets.hpp"
5
+
6
+ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ // loads the two adjacent f16 values at index ib + iqs from vx into v.x() / v.y()
7
+ const sycl::half *x = (const sycl::half *)vx;
8
+
9
+ // automatic half -> float type cast if dfloat == float
10
+ v.x() = x[ib + iqs + 0];
11
+ v.y() = x[ib + iqs + 1];
12
+ }
13
+
14
+ static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ // loads the two adjacent f32 values at index ib + iqs from vx into v.x() / v.y()
15
+ const float * x = (const float *) vx;
16
+
17
+ // x is already float, so no half -> float cast happens here; each value is converted to v's component type on assignment
18
+ v.x() = x[ib + iqs + 0];
19
+ v.y() = x[ib + iqs + 1];
20
+ }
21
+
22
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel> // kernel body: accumulates dequantized x[row, col] * y[col] over the columns of one row and stores the reduced sum in dst[row]
23
+ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
24
+ const sycl::nd_item<3> &item_ct1) {
25
+ // qk = quantized weights per x block
26
+ // qr = number of quantized weights per data value in x block
27
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
28
+ item_ct1.get_local_id(1); // matrix row handled by this work-item
29
+
30
+ if (row >= nrows) { // work-items mapped past the last row exit without writing dst
31
+ return;
32
+ }
33
+
34
+ const int tid = item_ct1.get_local_id(2); // position along dimension 2; selects this work-item's columns, and tid 0 writes the result
35
+
36
+ const int iter_stride = 2*GGML_SYCL_DMMV_X; // columns advanced per outer-loop iteration
37
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
38
+ const int y_offset = qr == 1 ? 1 : qk/2; // distance in y between the elements paired with v.x() and v.y()
39
+
40
+ // partial sum for each thread
41
+ #ifdef GGML_SYCL_F16
42
+ sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
43
+ #else
44
+ float tmp = 0.0f;
45
+ #endif // GGML_SYCL_F16
46
+
47
+ for (int i = 0; i < ncols; i += iter_stride) {
48
+ const int col = i + vals_per_iter*tid; // first column processed by this work-item in this iteration
49
+ const int ib = (row*ncols + col)/qk; // x block index; NOTE(review): row*ncols is evaluated in int - confirm it cannot overflow for large matrices
50
+ const int iqs = (col%qk)/qr; // x quant index
51
+ const int iybs = col - col%qk; // y block start index
52
+
53
+ // processing >2 values per i iter is faster for fast GPUs
54
+ #pragma unroll
55
+ for (int j = 0; j < vals_per_iter; j += 2) {
56
+ // process 2 vals per j iter
57
+
58
+ // dequantize
59
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
60
+ dfloat2 v;
61
+ dequantize_kernel(vx, ib, iqs + j/qr, v); // fills v with two dequantized weights
62
+
63
+ // matrix multiplication
64
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
65
+ #ifdef GGML_SYCL_F16
66
+ dfloat2 t1{y[iybs + iqs + j / qr + 0],
67
+ y[iybs + iqs + j / qr + y_offset]};
68
+
69
+ tmp += v * t1;
70
+ #else
71
+ tmp += v.x() * y[iybs + iqs + j / qr + 0];
72
+ tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
73
+ #endif // GGML_SYCL_F16
74
+ }
75
+ }
76
+
77
+ // sum up partial sums and write back result
78
+ const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; // NOTE(review): for ncols <= GGML_SYCL_DMMV_X the reduction starts one step lower - presumably only the lower half of the lanes hold valid sums then; confirm
79
+ for (int mask = mask_start; mask > 0; mask >>= 1) {
80
+ tmp +=
81
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); // presumably yields tmp of the sub-group lane whose id differs by mask - confirm against the dpct helper
82
+ }
83
+
84
+ if (tid == 0) { // a single work-item per row stores the result
85
+ #ifdef GGML_SYCL_F16
86
+ dst[row] = tmp.x() + tmp.y(); // fold the two half2 partial sums into one value
87
+ #else
88
+ dst[row] = tmp;
89
+ #endif // GGML_SYCL_F16
90
+ }
91
+ }
92
+
93
+ template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder> // kernel body: same row-times-vector accumulation as dequantize_mul_mat_vec, but calls the reorder dequantizer with two base pointers (d_ptr and vx)
94
+ static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
95
+ const sycl::nd_item<3> &item_ct1) {
96
+ // qk = quantized weights per x block
97
+ // qr = number of quantized weights per data value in x block
98
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
99
+ item_ct1.get_local_id(1); // matrix row handled by this work-item
100
+
101
+ if (row >= nrows) { // work-items mapped past the last row exit without writing dst
102
+ return;
103
+ }
104
+
105
+ const int tid = item_ct1.get_local_id(2); // position along dimension 2; selects this work-item's columns, and tid 0 writes the result
106
+
107
+
108
+ const int ncols_left = ncols % (QK4_0*WARP_SIZE); // columns remaining after whole groups of QK4_0*WARP_SIZE; NOTE(review): QK4_0 is hard-coded instead of qk - presumably only instantiated for Q4_0, confirm
109
+ const int ncols_align = ncols - ncols_left; // part of ncols that is a multiple of QK4_0*WARP_SIZE
110
+ const int iter_stride = 8*2*GGML_SYCL_DMMV_X; // columns advanced per outer-loop iteration
111
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16
112
+ const int y_offset = qr == 1 ? 1 : qk/2; // distance in y between the elements paired with v.x() and v.y()
113
+
114
+ // partial sum for each thread
115
+ #ifdef GGML_SYCL_F16
116
+ sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
117
+ #else
118
+ float tmp = 0.0f;
119
+ #endif // GGML_SYCL_F16
120
+ const char *d_ptr = (const char*)vx+ncols*nrows/2; // second base pointer, ncols*nrows/2 bytes past vx - presumably where the per-block scales start in the reordered layout (TODO confirm); NOTE(review): ncols*nrows is evaluated in int
121
+ int i=0; // declared outside the loops so the tail loop continues from where the aligned loop stopped
122
+ for (i = 0; i < ncols_align; i += iter_stride) { // main loop over the aligned part of the row
123
+ const int col = i + vals_per_iter*tid; // first column processed by this work-item in this iteration
124
+ const int ib = (row*ncols + col)/qk; // x block index; NOTE(review): row*ncols is evaluated in int - confirm it cannot overflow for large matrices
125
+ const int iqs = (col%qk)/qr; // x quant index
126
+ const int iybs = col - col%qk; // y block start index
127
+
128
+ // processing >2 values per i iter is faster for fast GPUs
129
+ #pragma unroll
130
+ for (int j = 0; j < vals_per_iter; j += 2) {
131
+ // process 2 vals per j iter
132
+
133
+ // dequantize
134
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
135
+ dfloat2 v;
136
+ dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); // arguments: d_ptr indexed by block ib, vx indexed by ib * QK4_0 / 2 plus the quant offset; fills v
137
+
138
+ // matrix multiplication
139
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
140
+ #ifdef GGML_SYCL_F16
141
+ dfloat2 t1{y[iybs + iqs + j / qr + 0],
142
+ y[iybs + iqs + j / qr + y_offset]};
143
+
144
+ tmp += v * t1;
145
+ #else
146
+ tmp += v.x() * y[iybs + iqs + j / qr + 0];
147
+ tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
148
+ #endif // GGML_SYCL_F16
149
+ }
150
+ }
151
+
152
+ for (; i < ncols; i += iter_stride) { // tail loop over the remaining ncols_left columns
153
+ if (tid>=ncols_left/QK4_0) continue; // work-items at or beyond ncols_left/QK4_0 skip the tail but still take part in the reduction below
154
+ const int col = i + vals_per_iter*tid; // first column processed by this work-item in this iteration
155
+ const int ib = (row*ncols + col)/qk; // x block index; NOTE(review): row*ncols is evaluated in int - confirm it cannot overflow for large matrices
156
+ const int iqs = (col%qk)/qr; // x quant index
157
+ const int iybs = col - col%qk; // y block start index
158
+
159
+ // processing >2 values per i iter is faster for fast GPUs
160
+ #pragma unroll
161
+ for (int j = 0; j < vals_per_iter; j += 2) {
162
+ // process 2 vals per j iter
163
+
164
+ // dequantize
165
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
166
+ dfloat2 v;
167
+ dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); // same call as in the aligned loop
168
+
169
+ // matrix multiplication
170
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
171
+ #ifdef GGML_SYCL_F16
172
+ dfloat2 t1{y[iybs + iqs + j / qr + 0],
173
+ y[iybs + iqs + j / qr + y_offset]};
174
+
175
+ tmp += v * t1;
176
+ #else
177
+ tmp += v.x() * y[iybs + iqs + j / qr + 0];
178
+ tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
179
+ #endif // GGML_SYCL_F16
180
+ }
181
+ }
182
+
183
+ // sum up partial sums and write back result
184
+ const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; // NOTE(review): for ncols <= GGML_SYCL_DMMV_X the reduction starts one step lower - confirm this is intended for the reorder path as well
185
+ for (int mask = mask_start; mask > 0; mask >>= 1) {
186
+ tmp +=
187
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); // presumably yields tmp of the sub-group lane whose id differs by mask - confirm against the dpct helper
188
+ }
189
+
190
+ if (tid == 0) { // a single work-item per row stores the result
191
+ #ifdef GGML_SYCL_F16
192
+ dst[row] = tmp.x() + tmp.y(); // fold the two half2 partial sums into one value
193
+ #else
194
+ dst[row] = tmp;
195
+ #endif // GGML_SYCL_F16
196
+ }
197
+ }
198
+
199
+ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
200
+ float *dst, const int ncols,
201
+ const int nrows,
202
+ dpct::queue_ptr stream) {
203
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
204
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
205
+ const sycl::range<3> block_nums(1, 1, block_num_y);
206
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
207
+ {
208
+ dpct::has_capability_or_fail(stream->get_device(),
209
+ {sycl::aspect::fp16});
210
+
211
+ stream->parallel_for(
212
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
213
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
214
+ dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
215
+ nrows, item_ct1);
216
+ });
217
+ }
218
+ }
219
+
220
+ /*
221
+ DPCT1110:4: The total declared local variable size in device function
222
+ dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
223
+ pressure. Consult with your hardware vendor to find the total register size
224
+ available and adjust the code, or use smaller sub-group size to avoid high
225
+ register pressure.
226
+ */
227
+ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
228
+ const float *__restrict__ yy,
229
+ float *__restrict__ dst,
230
+ const int ncols, int nrows,
231
+ const sycl::nd_item<3> &item_ct1) {
232
+
233
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
234
+
235
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
236
+ item_ct1.get_local_id(1);
237
+ if (row > nrows) return;
238
+
239
+ const int num_blocks_per_row = ncols / QK_K;
240
+ const int ib0 = row*num_blocks_per_row;
241
+
242
+ const block_q2_K * x = (const block_q2_K *)vx + ib0;
243
+
244
+ float tmp = 0; // partial sum for thread in warp
245
+
246
+ #if QK_K == 256
247
+ const int tid =
248
+ item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
249
+ const int ix =
250
+ item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
251
+
252
+ const int step = 16/K_QUANTS_PER_ITERATION;
253
+
254
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
255
+ const int in = tid - step*im; // 0...15 or 0...7
256
+
257
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
258
+ const int q_offset = 32*im + l0;
259
+ const int s_offset = 8*im;
260
+ const int y_offset = 128*im + l0;
261
+
262
+ uint32_t aux[4];
263
+ const uint8_t * d = (const uint8_t *)aux;
264
+ const uint8_t * m = (const uint8_t *)(aux + 2);
265
+
266
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
267
+
268
+ const float * y = yy + i * QK_K + y_offset;
269
+ const uint8_t * q = x[i].qs + q_offset;
270
+
271
+ const float dall = x[i].dm[0];
272
+ const float dmin = x[i].dm[1];
273
+
274
+ const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
275
+ aux[0] = a[0] & 0x0f0f0f0f;
276
+ aux[1] = a[1] & 0x0f0f0f0f;
277
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
278
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
279
+
280
+ float sum1 = 0, sum2 = 0;
281
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
282
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
283
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
284
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
285
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
286
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
287
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
288
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
289
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
290
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
291
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
292
+
293
+ }
294
+ tmp += dall * sum1 - dmin * sum2;
295
+
296
+ }
297
+ #else
298
+ const int tid = item_ct1.get_local_id(2) /
299
+ (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
300
+ const int ix = item_ct1.get_local_id(2) %
301
+ (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
302
+ const int offset = tid * K_QUANTS_PER_ITERATION;
303
+
304
+ uint32_t uaux[2];
305
+ const uint8_t * d = (const uint8_t *)uaux;
306
+
307
+
308
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
309
+
310
+ const float * y = yy + i * QK_K + offset;
311
+ const uint8_t * q = x[i].qs + offset;
312
+ const uint32_t * s = (const uint32_t *)x[i].scales;
313
+
314
+ uaux[0] = s[0] & 0x0f0f0f0f;
315
+ uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
316
+
317
+ const sycl::float2 dall =
318
+ x[i].dm.convert<float, sycl::rounding_mode::automatic>();
319
+
320
+ float sum1 = 0, sum2 = 0;
321
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
322
+ const uint8_t ql = q[l];
323
+ sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
324
+ + y[l+16] * d[1] * ((ql >> 2) & 3)
325
+ + y[l+32] * d[2] * ((ql >> 4) & 3)
326
+ + y[l+48] * d[3] * ((ql >> 6) & 3);
327
+ sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
328
+ }
329
+ tmp += dall.x() * sum1 - dall.y() * sum2;
330
+ }
331
+
332
+ #endif
333
+
334
+ // sum up partial sums and write back result
335
+ #pragma unroll
336
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
337
+ tmp +=
338
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
339
+ }
340
+
341
+ if (item_ct1.get_local_id(2) == 0) {
342
+ dst[row] = tmp;
343
+ }
344
+ }
345
+
346
+ /*
347
+ DPCT1110:5: The total declared local variable size in device function
348
+ dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
349
+ pressure. Consult with your hardware vendor to find the total register size
350
+ available and adjust the code, or use smaller sub-group size to avoid high
351
+ register pressure.
352
+ */
353
+ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
354
+ const float *__restrict__ yy,
355
+ float *__restrict__ dst,
356
+ const int ncols, int nrows,
357
+ const sycl::nd_item<3> &item_ct1) {
358
+
359
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
360
+ item_ct1.get_local_id(1);
361
+ if (row > nrows) return;
362
+
363
+ const int num_blocks_per_row = ncols / QK_K;
364
+ const int ib0 = row*num_blocks_per_row;
365
+
366
+ const block_q3_K * x = (const block_q3_K *)vx + ib0;
367
+
368
+ float tmp = 0; // partial sum for thread in warp
369
+
370
+ #if QK_K == 256
371
+
372
+ const uint16_t kmask1 = 0x0303;
373
+ const uint16_t kmask2 = 0x0f0f;
374
+
375
+ const int tid =
376
+ item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
377
+ const int ix =
378
+ item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
379
+
380
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
381
+ const int step = 16/K_QUANTS_PER_ITERATION;
382
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
383
+ const int in = tid - step*im; // 0....15 or 0...7
384
+
385
+ const uint8_t m = 1 << (4*im);
386
+
387
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
388
+ const int q_offset = 32*im + l0;
389
+ const int y_offset = 128*im + l0;
390
+
391
+ uint16_t utmp[4];
392
+ const int8_t * s = (const int8_t *)utmp;
393
+
394
+ const uint16_t s_shift = 4*im;
395
+
396
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
397
+
398
+ const float * y = yy + i * QK_K + y_offset;
399
+ const uint8_t * q = x[i].qs + q_offset;
400
+ const uint8_t * h = x[i].hmask + l0;
401
+
402
+ const uint16_t * a = (const uint16_t *)x[i].scales;
403
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
404
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
405
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
406
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
407
+
408
+ const float d = x[i].d;
409
+
410
+ float sum = 0;
411
+ for (int l = 0; l < n; ++l) {
412
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
413
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
414
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
415
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
416
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
417
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
418
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
419
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
420
+ }
421
+ tmp += d * sum;
422
+
423
+ }
424
+ #else
425
+
426
+ const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
427
+ const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
428
+ const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
429
+ const int in = offset/8; // 0 or 1
430
+ const int im = offset%8; // 0...7
431
+
432
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
433
+
434
+ const float * y = yy + i * QK_K + offset;
435
+ const uint8_t * q = x[i].qs + offset;
436
+ const uint8_t * s = x[i].scales;
437
+
438
+ const float dall = (float)x[i].d;
439
+
440
+ float sum = 0;
441
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
442
+ const uint8_t hl = x[i].hmask[im+l] >> in;
443
+ const uint8_t ql = q[l];
444
+ sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
445
+ + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
446
+ + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
447
+ + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
448
+ }
449
+ tmp += sum;
450
+ }
451
+ #endif
452
+
453
+ // sum up partial sums and write back result
454
+ #pragma unroll
455
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
456
+ tmp +=
457
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
458
+ }
459
+
460
+ if (item_ct1.get_local_id(2) == 0) {
461
+ dst[row] = tmp;
462
+ }
463
+ }
464
+
465
+ /*
466
+ DPCT1110:6: The total declared local variable size in device function
467
+ dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
468
+ pressure. Consult with your hardware vendor to find the total register size
469
+ available and adjust the code, or use smaller sub-group size to avoid high
470
+ register pressure.
471
+ */
472
+ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
473
+ const float *__restrict__ yy,
474
+ float *__restrict__ dst,
475
+ const int ncols, int nrows,
476
+ const sycl::nd_item<3> &item_ct1) {
477
+
478
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
479
+ item_ct1.get_local_id(1);
480
+ if (row > nrows) return;
481
+ const int num_blocks_per_row = ncols / QK_K;
482
+ const int ib0 = row*num_blocks_per_row;
483
+
484
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
485
+
486
+ #if QK_K == 256
487
+ const uint16_t kmask1 = 0x3f3f;
488
+ const uint16_t kmask2 = 0x0f0f;
489
+ const uint16_t kmask3 = 0xc0c0;
490
+
491
+ const int tid =
492
+ item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
493
+ const int ix =
494
+ item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
495
+
496
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
497
+
498
+ const int il = tid/step; // 0...3
499
+ const int ir = tid - step*il; // 0...7 or 0...3
500
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
501
+
502
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
503
+ const int in = il%2;
504
+
505
+ const int l0 = n*(2*ir + in);
506
+ const int q_offset = 32*im + l0;
507
+ const int y_offset = 64*im + l0;
508
+
509
+ uint16_t aux[4];
510
+ const uint8_t * sc = (const uint8_t *)aux;
511
+
512
+ #if K_QUANTS_PER_ITERATION == 2
513
+ uint32_t q32[4];
514
+ const uint8_t * q4 = (const uint8_t *)q32;
515
+ #else
516
+ uint16_t q16[4];
517
+ const uint8_t * q4 = (const uint8_t *)q16;
518
+ #endif
519
+
520
+ float tmp = 0; // partial sum for thread in warp
521
+
522
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
523
+
524
+ const float * y1 = yy + i*QK_K + y_offset;
525
+ const float * y2 = y1 + 128;
526
+
527
+ const float dall = x[i].dm[0];
528
+ const float dmin = x[i].dm[1];
529
+
530
+ const uint16_t * a = (const uint16_t *)x[i].scales;
531
+ aux[0] = a[im+0] & kmask1;
532
+ aux[1] = a[im+2] & kmask1;
533
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
534
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
535
+
536
+ #if K_QUANTS_PER_ITERATION == 2
537
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
538
+ const uint32_t * q2 = q1 + 16;
539
+
540
+ q32[0] = q1[0] & 0x0f0f0f0f;
541
+ q32[1] = q1[0] & 0xf0f0f0f0;
542
+ q32[2] = q2[0] & 0x0f0f0f0f;
543
+ q32[3] = q2[0] & 0xf0f0f0f0;
544
+
545
+ sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
546
+ float smin = 0;
547
+ for (int l = 0; l < 4; ++l) {
548
+ s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
549
+ s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
550
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
551
+ }
552
+ tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
553
+ s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
554
+ dmin * smin;
555
+ #else
556
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
557
+ const uint16_t * q2 = q1 + 32;
558
+
559
+ q16[0] = q1[0] & 0x0f0f;
560
+ q16[1] = q1[0] & 0xf0f0;
561
+ q16[2] = q2[0] & 0x0f0f;
562
+ q16[3] = q2[0] & 0xf0f0;
563
+
564
+ float4 s = {0.f, 0.f, 0.f, 0.f};
565
+ float smin = 0;
566
+ for (int l = 0; l < 2; ++l) {
567
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
568
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
569
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
570
+ }
571
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
572
+ #endif
573
+
574
+ }
575
+ #else
576
+ const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
577
+ const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
578
+
579
+ const int step = tid * K_QUANTS_PER_ITERATION;
580
+
581
+ uint16_t aux16[2];
582
+ const uint8_t * s = (const uint8_t *)aux16;
583
+
584
+ float tmp = 0;
585
+
586
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
587
+ const uint8_t * q = x[i].qs + step;
588
+ const float * y = yy + i*QK_K + step;
589
+ const uint16_t * a = (const uint16_t *)x[i].scales;
590
+ aux16[0] = a[0] & 0x0f0f;
591
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
592
+ const float d = (float)x[i].dm[0];
593
+ const float m = (float)x[i].dm[1];
594
+ float sum = 0.f;
595
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
596
+ sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
597
+ + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
598
+ + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
599
+ + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
600
+ }
601
+ tmp += sum;
602
+ }
603
+
604
+ #endif
605
+
606
+ // sum up partial sums and write back result
607
+ #pragma unroll
608
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
609
+ tmp +=
610
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
611
+ }
612
+
613
+ if (tid == 0) {
614
+ dst[row] = tmp;
615
+ }
616
+ }
617
+
618
+ /*
619
+ DPCT1110:7: The total declared local variable size in device function
620
+ dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
621
+ pressure. Consult with your hardware vendor to find the total register size
622
+ available and adjust the code, or use smaller sub-group size to avoid high
623
+ register pressure.
624
+ */
625
+ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
626
+ const float *__restrict__ yy,
627
+ float *__restrict__ dst,
628
+ const int ncols,
629
+ const sycl::nd_item<3> &item_ct1) {
630
+
631
+ const int row = item_ct1.get_group(2);
632
+ const int num_blocks_per_row = ncols / QK_K;
633
+ const int ib0 = row*num_blocks_per_row;
634
+
635
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
636
+
637
+ float tmp = 0; // partial sum for thread in warp
638
+
639
+ #if QK_K == 256
640
+ const uint16_t kmask1 = 0x3f3f;
641
+ const uint16_t kmask2 = 0x0f0f;
642
+ const uint16_t kmask3 = 0xc0c0;
643
+
644
+ const int tid = item_ct1.get_local_id(2) / 2; // 0...15
645
+ const int ix = item_ct1.get_local_id(2) % 2;
646
+
647
+ const int il = tid/4; // 0...3
648
+ const int ir = tid - 4*il;// 0...3
649
+ const int n = 2;
650
+
651
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
652
+ const int in = il%2;
653
+
654
+ const int l0 = n*(2*ir + in);
655
+ const int q_offset = 32*im + l0;
656
+ const int y_offset = 64*im + l0;
657
+
658
+ const uint8_t hm1 = 1 << (2*im);
659
+ const uint8_t hm2 = hm1 << 4;
660
+
661
+ uint16_t aux[4];
662
+ const uint8_t * sc = (const uint8_t *)aux;
663
+
664
+ uint16_t q16[8];
665
+ const uint8_t * q4 = (const uint8_t *)q16;
666
+
667
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
668
+
669
+ const uint8_t * ql1 = x[i].qs + q_offset;
670
+ const uint8_t * qh = x[i].qh + l0;
671
+ const float * y1 = yy + i*QK_K + y_offset;
672
+ const float * y2 = y1 + 128;
673
+
674
+ const float dall = x[i].dm[0];
675
+ const float dmin = x[i].dm[1];
676
+
677
+ const uint16_t * a = (const uint16_t *)x[i].scales;
678
+ aux[0] = a[im+0] & kmask1;
679
+ aux[1] = a[im+2] & kmask1;
680
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
681
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
682
+
683
+ sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
684
+ float smin = 0;
685
+ const uint16_t * q1 = (const uint16_t *)ql1;
686
+ const uint16_t * q2 = q1 + 32;
687
+ q16[0] = q1[0] & 0x0f0f;
688
+ q16[1] = q1[8] & 0x0f0f;
689
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
690
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
691
+ q16[4] = q2[0] & 0x0f0f;
692
+ q16[5] = q2[8] & 0x0f0f;
693
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
694
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
695
+ for (int l = 0; l < n; ++l) {
696
+ sum.x() +=
697
+ y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
698
+ y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
699
+ sum.y() +=
700
+ y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
701
+ y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
702
+ sum.z() +=
703
+ y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
704
+ y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
705
+ sum.w() +=
706
+ y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
707
+ y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
708
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
709
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
710
+ }
711
+ tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
712
+ sum.w() * sc[5]) -
713
+ dmin * smin;
714
+ }
715
+
716
+ #else
717
+ const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
718
+ const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
719
+ const int step = tid * K_QUANTS_PER_ITERATION;
720
+ const int im = step/8;
721
+ const int in = step%8;
722
+
723
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
724
+ const uint8_t * q = x[i].qs + step;
725
+ const int8_t * s = x[i].scales;
726
+ const float * y = yy + i*QK_K + step;
727
+ const float d = x[i].d;
728
+ float sum = 0.f;
729
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
730
+ const uint8_t h = x[i].qh[in+j] >> im;
731
+ sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
732
+ + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
733
+ + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
734
+ + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
735
+ }
736
+ tmp += sum;
737
+ }
738
+ #endif
739
+
740
+ // sum up partial sums and write back result
741
+ #pragma unroll
742
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
743
+ tmp +=
744
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
745
+ }
746
+
747
+ if (item_ct1.get_local_id(2) == 0) {
748
+ dst[row] = tmp;
749
+ }
750
+ }
751
+
752
+ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
753
+ const sycl::nd_item<3> &item_ct1) {
754
+
755
+ static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
756
+
757
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
758
+ item_ct1.get_local_id(1);
759
+ if (row > nrows) return;
760
+
761
+ const int num_blocks_per_row = ncols / QK_K;
762
+ const int ib0 = row*num_blocks_per_row;
763
+
764
+ const block_q6_K * x = (const block_q6_K *)vx + ib0;
765
+
766
+ #if QK_K == 256
767
+
768
+ const int tid =
769
+ item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
770
+ const int ix =
771
+ item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
772
+
773
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
774
+
775
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
776
+ const int in = tid - step*im; // 0...15 or 0...7
777
+
778
+ #if K_QUANTS_PER_ITERATION == 1
779
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
780
+ const int is = 0;
781
+ #else
782
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
783
+ const int is = in / 4;
784
+ #endif
785
+ const int ql_offset = 64*im + l0;
786
+ const int qh_offset = 32*im + l0;
787
+ const int s_offset = 8*im + is;
788
+ const int y_offset = 128*im + l0;
789
+
790
+ float tmp = 0; // partial sum for thread in warp
791
+
792
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
793
+
794
+ const float * y = yy + i * QK_K + y_offset;
795
+ const uint8_t * ql = x[i].ql + ql_offset;
796
+ const uint8_t * qh = x[i].qh + qh_offset;
797
+ const int8_t * s = x[i].scales + s_offset;
798
+
799
+ const float d = x[i].d;
800
+
801
+ #if K_QUANTS_PER_ITERATION == 1
802
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
803
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
804
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
805
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
806
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
807
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
808
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
809
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
810
+ tmp += sum;
811
+ #else
812
+ float sum = 0;
813
+ for (int l = 0; l < 4; ++l) {
814
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
815
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
816
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
817
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
818
+ }
819
+ tmp += sum;
820
+ #endif
821
+
822
+ }
823
+
824
+ #else
825
+
826
+ const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
827
+ const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
828
+
829
+ const int step = tid * K_QUANTS_PER_ITERATION;
830
+
831
+ float tmp = 0; // partial sum for thread in warp
832
+
833
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
834
+
835
+ const float * y = yy + i * QK_K + step;
836
+ const uint8_t * ql = x[i].ql + step;
837
+ const uint8_t * qh = x[i].qh + step;
838
+ const int8_t * s = x[i].scales;
839
+
840
+ const float d = x[i+0].d;
841
+
842
+ float sum = 0;
843
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
844
+ sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
845
+ + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
846
+ + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
847
+ + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
848
+ }
849
+ tmp += sum;
850
+
851
+ }
852
+
853
+ #endif
854
+
855
+ // sum up partial sums and write back result
856
+ #pragma unroll
857
+ for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
858
+ tmp +=
859
+ dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
860
+ }
861
+
862
+ if (tid == 0) {
863
+ dst[row] = tmp;
864
+ }
865
+ }
866
+
867
+ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
868
+ float *dst, const int ncols,
869
+ const int nrows,
870
+ dpct::queue_ptr stream) {
871
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
872
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
873
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
874
+ const sycl::range<3> block_nums(1, 1, block_num_y);
875
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
876
+ {
877
+ dpct::has_capability_or_fail(stream->get_device(),
878
+ {sycl::aspect::fp16});
879
+
880
+ stream->parallel_for(
881
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
882
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
883
+ dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
884
+ vx, y, dst, ncols, nrows, item_ct1);
885
+ });
886
+ }
887
+ }
888
+
889
+
890
+ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
891
+ float *dst, const int ncols,
892
+ const int nrows,
893
+ dpct::queue_ptr stream) {
894
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
895
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
896
+ // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
897
+ const sycl::range<3> block_nums(1, 1, block_num_y);
898
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
899
+ {
900
+ dpct::has_capability_or_fail(stream->get_device(),
901
+ {sycl::aspect::fp16});
902
+
903
+ stream->parallel_for(
904
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
905
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
906
+ dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
907
+ vx, y, dst, ncols, nrows, item_ct1);
908
+ });
909
+ }
910
+ }
911
+
912
+ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
913
+ float *dst, const int ncols,
914
+ const int nrows,
915
+ dpct::queue_ptr stream) {
916
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
917
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
918
+ const sycl::range<3> block_nums(1, 1, block_num_y);
919
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
920
+ {
921
+ dpct::has_capability_or_fail(stream->get_device(),
922
+ {sycl::aspect::fp16});
923
+
924
+ stream->parallel_for(
925
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
926
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
927
+ dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
928
+ vx, y, dst, ncols, nrows, item_ct1);
929
+ });
930
+ }
931
+ }
932
+
933
+ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
934
+ float *dst, const int ncols,
935
+ const int nrows,
936
+ dpct::queue_ptr stream) {
937
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
938
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
939
+ const sycl::range<3> block_nums(1, 1, block_num_y);
940
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
941
+ {
942
+ dpct::has_capability_or_fail(stream->get_device(),
943
+ {sycl::aspect::fp16});
944
+
945
+ stream->parallel_for(
946
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
947
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
948
+ dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
949
+ vx, y, dst, ncols, nrows, item_ct1);
950
+ });
951
+ }
952
+ }
953
+
954
+ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
955
+ float *dst, const int ncols,
956
+ const int nrows,
957
+ dpct::queue_ptr stream) {
958
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
959
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
960
+ const sycl::range<3> block_nums(1, 1, block_num_y);
961
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
962
+ {
963
+ dpct::has_capability_or_fail(stream->get_device(),
964
+ {sycl::aspect::fp16});
965
+
966
+ stream->parallel_for(
967
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
968
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
969
+ dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
970
+ vx, y, dst, ncols, nrows, item_ct1);
971
+ });
972
+ }
973
+ }
974
+
975
+ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
976
+ float *dst, const int ncols,
977
+ const int nrows,
978
+ dpct::queue_ptr stream) {
979
+ GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
980
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
981
+ const sycl::range<3> block_nums(1, 1, block_num_y);
982
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
983
+ {
984
+ dpct::has_capability_or_fail(stream->get_device(),
985
+ {sycl::aspect::fp16});
986
+
987
+ stream->parallel_for(
988
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
989
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
990
+ dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
991
+ vx, y, dst, ncols, nrows, item_ct1);
992
+ });
993
+ }
994
+ }
995
+
996
+ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
997
+ float *dst, const int ncols,
998
+ const int nrows,
999
+ dpct::queue_ptr stream) {
1000
+ GGML_ASSERT(ncols % QK_K == 0);
1001
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1002
+ const int block_num_y = (nrows + ny - 1) / ny;
1003
+ const sycl::range<3> block_nums(1, 1, block_num_y);
1004
+ const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1005
+ stream->parallel_for(
1006
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
1007
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1008
+ dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
1009
+ });
1010
+ }
1011
+
1012
+ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
1013
+ float *dst, const int ncols,
1014
+ const int nrows,
1015
+ dpct::queue_ptr stream) {
1016
+ GGML_ASSERT(ncols % QK_K == 0);
1017
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1018
+ const int block_num_y = (nrows + ny - 1) / ny;
1019
+ const sycl::range<3> block_nums(1, 1, block_num_y);
1020
+ const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1021
+ stream->parallel_for(
1022
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
1023
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1024
+ dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
1025
+ });
1026
+ }
1027
+
1028
+ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
1029
+ float *dst, const int ncols,
1030
+ const int nrows,
1031
+ dpct::queue_ptr stream) {
1032
+ GGML_ASSERT(ncols % QK_K == 0);
1033
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1034
+ const int block_num_y = (nrows + ny - 1) / ny;
1035
+ const sycl::range<3> block_nums(1, 1, block_num_y);
1036
+ const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1037
+ stream->parallel_for(
1038
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
1039
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1040
+ dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
1041
+ });
1042
+ }
1043
+
1044
+ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
1045
+ float *dst, const int ncols,
1046
+ const int nrows,
1047
+ dpct::queue_ptr stream) {
1048
+ GGML_ASSERT(ncols % QK_K == 0);
1049
+ const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
1050
+ stream->parallel_for(
1051
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
1052
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1053
+ dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
1054
+ });
1055
+ }
1056
+
1057
+ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
1058
+ float *dst, const int ncols,
1059
+ const int nrows,
1060
+ dpct::queue_ptr stream) {
1061
+ GGML_ASSERT(ncols % QK_K == 0);
1062
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1063
+ const int block_num_y = (nrows + ny - 1) / ny;
1064
+ const sycl::range<3> block_nums(1, 1, block_num_y);
1065
+ const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
1066
+ stream->parallel_for(
1067
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
1068
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
1069
+ dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
1070
+ });
1071
+ }
1072
+
1073
+ void ggml_sycl_op_dequantize_mul_mat_vec(
1074
+ ggml_backend_sycl_context & ctx,
1075
+ const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
1076
+ const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
1077
+ float *dst_dd_i, const int64_t row_low, const int64_t row_high,
1078
+ const int64_t src1_ncols, const int64_t src1_padded_row_size,
1079
+ const dpct::queue_ptr &stream) {
1080
+
1081
+ const int64_t ne00 = src0->ne[0];
1082
+ const int64_t row_diff = row_high - row_low;
1083
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
1084
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1085
+ #ifdef GGML_SYCL_F16
1086
+ ggml_sycl_pool_alloc<sycl::half> src1_dfloat_a(ctx.pool());
1087
+ sycl::half *src1_dfloat = nullptr; // dfloat == half
1088
+
1089
+ bool src1_convert_f16 =
1090
+ src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1091
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1092
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1093
+
1094
+ if (src1_convert_f16) {
1095
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
1096
+ " : converting src1 to fp16");
1097
+ src1_dfloat = src1_dfloat_a.alloc(ne00);
1098
+ const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
1099
+ GGML_ASSERT(to_fp16_sycl != nullptr);
1100
+ to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
1101
+ }
1102
+ #else
1103
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
1104
+ #endif // GGML_SYCL_F16
1105
+
1106
+ switch (src0->type) {
1107
+ case GGML_TYPE_Q4_0:
1108
+ if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
1109
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
1110
+ dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1111
+ } else {
1112
+ dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1113
+ }
1114
+ break;
1115
+ case GGML_TYPE_Q4_1:
1116
+ dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1117
+ break;
1118
+ case GGML_TYPE_Q5_0:
1119
+ dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1120
+ break;
1121
+ case GGML_TYPE_Q5_1:
1122
+ dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1123
+ break;
1124
+ case GGML_TYPE_Q8_0:
1125
+ dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1126
+ break;
1127
+ case GGML_TYPE_Q2_K:
1128
+ dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1129
+ break;
1130
+ case GGML_TYPE_Q3_K:
1131
+ dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1132
+ break;
1133
+ case GGML_TYPE_Q4_K:
1134
+ if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1135
+ ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1136
+ // reorder is currently not supported for dmmv
1137
+ GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
1138
+ } else {
1139
+ dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1140
+ }
1141
+ break;
1142
+ case GGML_TYPE_Q5_K:
1143
+ dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1144
+ break;
1145
+ case GGML_TYPE_Q6_K:
1146
+ dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1147
+ break;
1148
+ case GGML_TYPE_F16:
1149
+ convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
1150
+ break;
1151
+ default:
1152
+ printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
1153
+ GGML_ABORT("fatal error");
1154
+ }
1155
+
1156
+ GGML_UNUSED(src1);
1157
+ GGML_UNUSED(dst);
1158
+ GGML_UNUSED(src1_ddq_i);
1159
+ GGML_UNUSED(src1_ncols);
1160
+ GGML_UNUSED(src1_padded_row_size);
1161
+ GGML_UNUSED(ctx);
1162
+ }