whispercpp 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (797):
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -3
  3. data/README.md +92 -31
  4. data/Rakefile +26 -7
  5. data/ext/.gitignore +5 -7
  6. data/ext/dependencies.rb +61 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +221 -0
  9. data/ext/ruby_whisper.c +159 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +641 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1301 -0
  15. data/ext/ruby_whisper_segment.c +143 -0
  16. data/ext/ruby_whisper_transcribe.cpp +87 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/.dockerignore +3 -0
  19. data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
  20. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  21. data/ext/sources/CMakeLists.txt +251 -0
  22. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  23. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  24. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  25. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  26. data/ext/sources/bindings/javascript/package.json +26 -0
  27. data/ext/sources/bindings/javascript/whisper.js +19 -0
  28. data/ext/sources/build-xcframework.sh +547 -0
  29. data/ext/sources/ci/run.sh +336 -0
  30. data/ext/sources/close-issue.yml +28 -0
  31. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  32. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  33. data/ext/sources/cmake/build-info.cmake +60 -0
  34. data/ext/sources/cmake/git-vars.cmake +22 -0
  35. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  36. data/ext/sources/cmake/whisper.pc.in +10 -0
  37. data/ext/sources/examples/CMakeLists.txt +124 -0
  38. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  39. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
  40. data/ext/sources/examples/addon.node/addon.cpp +438 -0
  41. data/ext/sources/examples/addon.node/index.js +54 -0
  42. data/ext/sources/examples/addon.node/package.json +16 -0
  43. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  44. data/ext/sources/examples/bench/bench.cpp +175 -0
  45. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  46. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  47. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  48. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  49. data/ext/sources/examples/cli/cli.cpp +1294 -0
  50. data/ext/sources/examples/coi-serviceworker.js +146 -0
  51. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  52. data/ext/sources/examples/command/command.cpp +776 -0
  53. data/ext/sources/examples/command/commands.txt +9 -0
  54. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  55. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  56. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  57. data/ext/sources/examples/common-ggml.cpp +238 -0
  58. data/ext/sources/examples/common-ggml.h +18 -0
  59. data/ext/sources/examples/common-sdl.cpp +227 -0
  60. data/ext/sources/examples/common-sdl.h +49 -0
  61. data/ext/sources/examples/common-whisper.cpp +168 -0
  62. data/ext/sources/examples/common-whisper.h +24 -0
  63. data/ext/sources/examples/common.cpp +675 -0
  64. data/ext/sources/examples/common.h +322 -0
  65. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  66. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  67. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  68. data/ext/sources/examples/generate-karaoke.sh +57 -0
  69. data/ext/sources/examples/grammar-parser.cpp +423 -0
  70. data/ext/sources/examples/grammar-parser.h +29 -0
  71. data/ext/sources/examples/helpers.js +191 -0
  72. data/ext/sources/examples/json.hpp +24596 -0
  73. data/ext/sources/examples/livestream.sh +112 -0
  74. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  75. data/ext/sources/examples/lsp/lsp.cpp +467 -0
  76. data/ext/sources/examples/lsp/whisper.vim +362 -0
  77. data/ext/sources/examples/miniaudio.h +93468 -0
  78. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  79. data/ext/sources/examples/python/whisper_processor.py +54 -0
  80. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  81. data/ext/sources/examples/quantize/quantize.cpp +223 -0
  82. data/ext/sources/examples/server/CMakeLists.txt +12 -0
  83. data/ext/sources/examples/server/bench.js +29 -0
  84. data/ext/sources/examples/server/httplib.h +10497 -0
  85. data/ext/sources/examples/server/server.cpp +1091 -0
  86. data/ext/sources/examples/server.py +115 -0
  87. data/ext/sources/examples/stb_vorbis.c +5584 -0
  88. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  89. data/ext/sources/examples/stream/stream.cpp +429 -0
  90. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  91. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  92. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  93. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  94. data/ext/sources/examples/sycl/build.sh +22 -0
  95. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  96. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  97. data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
  98. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  99. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  100. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  101. data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
  102. data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
  103. data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
  104. data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
  105. data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
  106. data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
  107. data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
  108. data/ext/sources/examples/talk-llama/llama-context.h +276 -0
  109. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  110. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  111. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  112. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  113. data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
  114. data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
  115. data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
  116. data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
  117. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  118. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  119. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  120. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
  124. data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
  125. data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
  126. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  127. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  128. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
  129. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  130. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
  131. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  132. data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
  133. data/ext/sources/examples/talk-llama/llama-model.h +425 -0
  134. data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
  135. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  136. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  137. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  138. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
  139. data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
  140. data/ext/sources/examples/talk-llama/llama.cpp +354 -0
  141. data/ext/sources/examples/talk-llama/llama.h +1377 -0
  142. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  143. data/ext/sources/examples/talk-llama/speak +40 -0
  144. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  145. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  146. data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
  147. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  148. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  149. data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
  150. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  151. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  152. data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
  153. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  154. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  155. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  156. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  157. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  158. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  159. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  160. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  161. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
  162. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  163. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  164. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  165. data/ext/sources/ggml/CMakeLists.txt +390 -0
  166. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  167. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  168. data/ext/sources/ggml/cmake/common.cmake +26 -0
  169. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  170. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  171. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
  172. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  173. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
  174. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  176. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  178. data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
  179. data/ext/sources/ggml/include/gguf.h +202 -0
  180. data/ext/sources/ggml/src/CMakeLists.txt +346 -0
  181. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  182. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  183. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  184. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
  185. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
  186. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  187. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
  188. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  189. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  190. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  191. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  192. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
  195. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  196. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  197. data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
  198. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
  199. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
  200. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  201. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  202. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  203. data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
  205. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
  206. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
  207. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
  208. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
  209. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
  210. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  211. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  212. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  213. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  214. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
  215. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  216. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
  217. data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
  218. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  219. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  220. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  221. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
  222. data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
  223. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  224. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  225. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  227. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  229. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  231. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  232. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  233. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  234. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  235. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  236. data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
  237. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  238. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  239. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  240. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  241. data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
  242. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
  243. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  244. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  245. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  246. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  247. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  248. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  249. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  251. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  252. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  254. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
  255. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  256. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  257. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  258. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  259. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  260. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  261. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
  262. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  263. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  264. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  265. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  266. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  267. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
  268. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  269. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  270. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  271. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  272. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  273. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  274. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  275. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
  276. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
  277. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  278. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  279. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  280. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  281. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  282. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  283. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  284. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  286. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  287. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  288. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  289. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  290. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  291. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  292. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  293. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  294. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  295. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  296. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  297. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  298. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  299. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
  300. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  301. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  302. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  303. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
  304. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
  305. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  306. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  307. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  308. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  309. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  310. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  312. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  313. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  314. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  315. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  316. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  317. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  318. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  319. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  320. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  321. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  322. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  323. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  324. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  325. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  326. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  327. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  328. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  329. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  330. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  332. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  430. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
  432. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
  433. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  434. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  436. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  437. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  438. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  439. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  440. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  441. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
  442. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
  443. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  444. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  445. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  446. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  447. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  448. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  449. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  450. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  451. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  452. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  453. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  454. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  455. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  456. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  457. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  458. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  459. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  460. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  461. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  462. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  463. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  464. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  465. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  466. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  467. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  468. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  469. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  470. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  471. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  481. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
  482. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
  483. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
  484. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
  485. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  486. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  487. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  488. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
  489. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
  490. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  491. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  492. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  493. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  494. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  495. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  496. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  497. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  498. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  499. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  500. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  501. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  502. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  503. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  504. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  505. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  506. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  507. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  508. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  509. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  510. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  511. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  512. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  513. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  514. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  515. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  516. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  517. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  526. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  527. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
  528. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  529. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
  530. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  531. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  532. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
  533. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  534. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  535. data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
  536. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
  537. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  538. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
  539. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  540. data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
  541. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  542. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
  543. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  544. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
  545. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
  546. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  547. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
  548. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
  549. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
  550. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
  551. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
  552. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  553. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
  554. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  555. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  556. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  557. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  558. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
  559. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  560. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
  561. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  562. data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
  563. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  564. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  565. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  566. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  567. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
  568. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
  569. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  570. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
  571. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  572. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  573. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  574. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
  575. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  576. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
  577. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
  578. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  579. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
  580. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  581. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
  582. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
  583. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  584. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  585. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  586. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  587. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  588. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  589. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  590. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  591. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  592. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  593. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  594. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  595. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  596. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  597. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  598. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  599. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  600. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  601. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  602. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  603. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  604. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  605. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  606. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  607. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  608. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  609. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  610. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  611. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  612. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  613. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  614. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  615. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  616. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  617. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  618. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  619. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  620. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  623. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  692. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  695. data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
  696. data/ext/sources/ggml/src/gguf.cpp +1330 -0
  697. data/ext/{include → sources/include}/whisper.h +68 -2
  698. data/ext/sources/src/CMakeLists.txt +143 -0
  699. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  700. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
  701. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  702. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
  703. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  704. data/ext/sources/src/whisper-arch.h +197 -0
  705. data/ext/{src → sources/src}/whisper.cpp +1905 -374
  706. data/ext/sources/tests/CMakeLists.txt +105 -0
  707. data/ext/sources/tests/earnings21/eval.mk +58 -0
  708. data/ext/sources/tests/earnings21/eval.py +68 -0
  709. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  710. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  711. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  712. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  713. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  714. data/ext/sources/tests/en-0-ref.txt +1 -0
  715. data/ext/sources/tests/en-1-ref.txt +1 -0
  716. data/ext/sources/tests/en-2-ref.txt +1 -0
  717. data/ext/sources/tests/es-0-ref.txt +1 -0
  718. data/ext/sources/tests/librispeech/eval.mk +39 -0
  719. data/ext/sources/tests/librispeech/eval.py +47 -0
  720. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  721. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  722. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  723. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  724. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  725. data/ext/sources/tests/run-tests.sh +130 -0
  726. data/ext/sources/tests/test-c.c +3 -0
  727. data/ext/sources/tests/test-vad-full.cpp +54 -0
  728. data/ext/sources/tests/test-vad.cpp +83 -0
  729. data/ext/sources/tests/test-whisper.js +58 -0
  730. data/extsources.rb +33 -5
  731. data/lib/whisper/model/uri.rb +149 -128
  732. data/sig/whisper.rbs +480 -0
  733. data/tests/helper.rb +28 -0
  734. data/tests/test_callback.rb +45 -3
  735. data/tests/test_error.rb +2 -2
  736. data/tests/test_model.rb +38 -0
  737. data/tests/test_package.rb +18 -3
  738. data/tests/test_params.rb +145 -8
  739. data/tests/test_segment.rb +10 -19
  740. data/tests/test_vad.rb +19 -0
  741. data/tests/test_vad_params.rb +103 -0
  742. data/tests/test_whisper.rb +37 -37
  743. data/whispercpp.gemspec +5 -4
  744. metadata +766 -111
  745. data/ext/cpu.mk +0 -9
  746. data/ext/examples/dr_wav.h +0 -8815
  747. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  748. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  749. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  750. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  751. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  752. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  753. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  754. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  755. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  756. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  757. data/ext/metal-embed.mk +0 -17
  758. data/ext/metal.mk +0 -6
  759. data/ext/ruby_whisper.cpp +0 -1909
  760. data/ext/scripts/get-flags.mk +0 -38
  761. data/lib/whisper.rb +0 -2
  762. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  763. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  764. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  765. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  766. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  767. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  768. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  769. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  770. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  771. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  772. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  773. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  774. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  775. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  776. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  777. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  778. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  779. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  780. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  781. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  782. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  783. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
  784. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  785. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
  786. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
  787. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
  788. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
  789. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
  790. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
  791. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  792. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  793. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  794. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  795. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  796. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  797. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -0,0 +1,892 @@
1
+ #pragma once
2
+
3
+ #include "ggml-cpu-impl.h"
4
+
5
+ //
6
+ // simd mappings
7
+ //
8
+
9
+ // we define a common set of C macros which map to specific intrinsics based on the current architecture
10
+ // we then implement the fundamental computation operations below using only these macros
11
+ // adding support for a new architecture requires defining the corresponding SIMD macros
12
+ //
13
+ // GGML_F32_STEP / GGML_F16_STEP
14
+ // number of elements to process in a single step
15
+ //
16
+ // GGML_F32_EPR / GGML_F16_EPR
17
+ // number of elements to fit in a single register
18
+ //
19
+
20
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
21
+
22
+ #define GGML_SIMD // set in every architecture branch visible here; presumably tested elsewhere to enable the SIMD code paths (confirm against users)
23
+
24
+ // F32 NEON
25
+
26
+ #define GGML_F32_STEP 16 // number of floats processed in a single step
27
+ #define GGML_F32_EPR 4 // number of floats that fit in one float32x4_t register
28
+
29
+ #define GGML_F32x4 float32x4_t // vector type used for F32 work on NEON
30
+ #define GGML_F32x4_ZERO vdupq_n_f32(0.0f) // vector with every lane set to 0.0f
31
+ #define GGML_F32x4_SET1(x) vdupq_n_f32(x) // broadcast the scalar x to every lane
32
+ #define GGML_F32x4_LOAD vld1q_f32 // load 4 floats from a pointer
33
+ #define GGML_F32x4_STORE vst1q_f32 // store 4 floats to a pointer
34
+ #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) // a + b*c; the accumulator a is the first argument
35
+ #define GGML_F32x4_ADD vaddq_f32 // lane-wise addition
36
+ #define GGML_F32x4_MUL vmulq_f32 // lane-wise multiplication
37
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) // add the 4 lanes of one vector into a single float
38
+ #define GGML_F32x4_REDUCE(res, x) \
39
+ { \
40
+ int offset = GGML_F32_ARR >> 1; \
41
+ for (int i = 0; i < offset; ++i) { \
42
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
43
+ } \
44
+ offset >>= 1; \
45
+ for (int i = 0; i < offset; ++i) { \
46
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
47
+ } \
48
+ offset >>= 1; \
49
+ for (int i = 0; i < offset; ++i) { \
50
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
51
+ } \
52
+ (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
53
+ }
54
+
55
+ #define GGML_F32_VEC GGML_F32x4 // architecture-neutral F32 names, bound here to the NEON 4-lane macros
56
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
57
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
58
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
59
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
60
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
61
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
62
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
63
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // array-of-vectors reduction into a scalar
64
+
65
+ // F16 NEON
66
+
67
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
68
+ #define GGML_F16_STEP 32 // number of half-precision elements processed in a single step
69
+ #define GGML_F16_EPR 8 // number of half-precision elements that fit in one float16x8_t register
70
+
71
+ #define GGML_F16x8 float16x8_t // vector type used when FP16 vector arithmetic is available
72
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) // vector with every lane set to zero
73
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x) // broadcast the scalar x to every lane
74
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) // load 8 halves; the pointer is cast to const __fp16 * first
75
+ #define GGML_F16x8_STORE vst1q_f16 // store 8 halves to a pointer
76
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) // a + b*c; the accumulator a is the first argument
77
+ #define GGML_F16x8_ADD vaddq_f16 // lane-wise addition
78
+ #define GGML_F16x8_MUL vmulq_f16 // lane-wise multiplication
79
+ #define GGML_F16x8_REDUCE(res, x) /* folds the float16x8_t accumulator array x[] into x[0] (overwriting x[]), then stores the lane sum in res */ \
80
+ do { /* do/while(0) makes the expansion a single statement */ \
81
+ int offset = GGML_F16_ARR >> 1; /* first pass: add the upper half of x[] onto the lower half */ \
82
+ for (int i = 0; i < offset; ++i) { \
83
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
84
+ } \
85
+ offset >>= 1; /* second pass over the remaining lower half */ \
86
+ for (int i = 0; i < offset; ++i) { \
87
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
88
+ } \
89
+ offset >>= 1; /* third pass; a no-op once offset has reached 0 */ \
90
+ for (int i = 0; i < offset; ++i) { \
91
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
92
+ } \
93
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); /* widen the low 4 halves to f32 */ \
94
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); /* widen the high 4 halves to f32 */ \
95
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); /* final horizontal add is done in f32 */ \
96
+ } while (0)
97
+
98
+ #define GGML_F16_VEC GGML_F16x8 // architecture-neutral F16 names, bound here to the native 8-lane FP16 macros
99
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
100
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
101
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) // the index argument i is not used by this expansion
102
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) // stores element i of the register array r to p
103
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
104
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
105
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
106
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
107
+ #else
108
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
109
+ // and take advantage of the vcvt_ functions to convert to/from FP16
110
+
111
+ #define GGML_F16_STEP 16 // number of half-precision elements processed in a single step on the FP32 fallback path
112
+ #define GGML_F16_EPR 4 // 4 elements per register, since values are held as float32x4_t
113
+
114
+ #define GGML_F32Cx4 float32x4_t // F16 data is computed on as 4 f32 lanes
115
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) // vector with every lane set to 0.0f
116
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) // broadcast the scalar x to every lane
117
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) // load 4 halves and widen them to f32
118
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) // narrow the f32 vector y to halves and store them at x
119
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) // a + b*c; the accumulator a is the first argument
120
+ #define GGML_F32Cx4_ADD vaddq_f32 // lane-wise addition
121
+ #define GGML_F32Cx4_MUL vmulq_f32 // lane-wise multiplication
122
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE // same reduction as the plain F32 path
123
+
124
+ #define GGML_F16_VEC GGML_F32Cx4 // architecture-neutral F16 names, bound here to the FP32-conversion fallback macros
125
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
126
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
127
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) // the index argument i is not used by this expansion
128
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), (r)[i]) // r is parenthesized so any expression can be indexed
129
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
130
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
131
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
132
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
133
+ #endif
134
+
135
+ #elif defined(__AVX512F__)
136
+
137
+ #define GGML_SIMD
138
+
139
+ // F32 AVX512
140
+
141
+ #define GGML_F32_STEP 64
142
+ #define GGML_F32_EPR 16
143
+
144
+ #define GGML_F32x16 __m512
145
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
146
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
147
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
148
+ #define GGML_F32x16_STORE _mm512_storeu_ps
149
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
150
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
151
+ #define GGML_F32x16_ADD _mm512_add_ps
152
+ #define GGML_F32x16_MUL _mm512_mul_ps
153
// Horizontal sum of the GGML_F32_ARR __m512 accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part; the lanes of
// x[0] are then summed with _mm512_reduce_add_ps. x[] is modified in place.
// Both macro arguments are parenthesized so that expressions (e.g. `base + 1`)
// can be passed without changing the meaning of the expansion.
#define GGML_F32x16_REDUCE(res, x)                                \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);            \
} while (0)
169
+
170
+ // TODO: is this optimal ?
171
+
172
+ #define GGML_F32_VEC GGML_F32x16
173
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
174
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
175
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
176
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
177
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
178
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
179
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
180
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
181
+
182
+ // F16 AVX512
183
+
184
+ // (same scheme as the F16 AVX section: F16 data is converted to F32 for arithmetic)
185
+
186
+ #define GGML_F16_STEP 64
187
+ #define GGML_F16_EPR 16
188
+
189
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
190
+
191
+ #define GGML_F32Cx16 __m512
192
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
193
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
194
+
195
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
196
+ // so F16C guard isn't required
197
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
198
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
199
+
200
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
201
+ #define GGML_F32Cx16_ADD _mm512_add_ps
202
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
203
// Horizontal sum of the F16-path accumulators (held as __m512 floats) in x[] into res.
// Three halving passes add the upper part of x[] onto the lower part; the lanes of
// x[0] are then summed with _mm512_reduce_add_ps. x[] is modified in place.
// The register count is taken from GGML_F16_ARR, as the other F16 reducers in this
// file do; in this AVX512 branch it has the same value as GGML_F32_ARR (64/16).
// Both macro arguments are parenthesized so expressions can be passed safely.
#define GGML_F32Cx16_REDUCE(res, x)                               \
do {                                                              \
    int offset = GGML_F16_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);            \
    }                                                             \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);            \
} while (0)
219
+
220
+ #define GGML_F16_VEC GGML_F32Cx16
221
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
222
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
223
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
224
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
225
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
226
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
227
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
228
+
229
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
230
+ #elif defined(__AVX__)
231
+
232
+ #define GGML_SIMD
233
+
234
+ // F32 AVX
235
+
236
+ #define GGML_F32_STEP 32
237
+ #define GGML_F32_EPR 8
238
+
239
+ #define GGML_F32x8 __m256
240
+ #define GGML_F32x8_ZERO _mm256_setzero_ps()
241
+ #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
242
+ #define GGML_F32x8_LOAD _mm256_loadu_ps
243
+ #define GGML_F32x8_STORE _mm256_storeu_ps
244
+ #if defined(__FMA__)
245
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
246
+ #else
247
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
248
+ #endif
249
+ #define GGML_F32x8_ADD _mm256_add_ps
250
+ #define GGML_F32x8_MUL _mm256_mul_ps
251
// Horizontal sum of the GGML_F32_ARR __m256 accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part; the two
// 128-bit halves of x[0] are then added and reduced with two _mm_hadd_ps steps.
// x[] is modified in place. Both macro arguments are parenthesized so that
// expressions can be passed without changing the meaning of the expansion.
#define GGML_F32x8_REDUCE(res, x)                                     \
do {                                                                  \
    int offset = GGML_F32_ARR >> 1;                                   \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128((x)[0]),      \
                                 _mm256_extractf128_ps((x)[0], 1));   \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                            \
    (res) = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));          \
} while (0)
270
+ // TODO: is this optimal ?
271
+
272
+ #define GGML_F32_VEC GGML_F32x8
273
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
274
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
275
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
276
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
277
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
278
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
279
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
280
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
281
+
282
+ // F16 AVX
283
+
284
+ #define GGML_F16_STEP 32
285
+ #define GGML_F16_EPR 8
286
+
287
+ // F16 arithmetic is not supported by AVX, so we use F32 instead
288
+
289
+ #define GGML_F32Cx8 __m256
290
+ #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
291
+ #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
292
+
293
+ #if defined(__F16C__)
294
+ // the _mm256_cvt intrinsics require F16C
295
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
296
+ #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
297
+ #else
298
+ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
299
+ float tmp[8];
300
+
301
+ for (int i = 0; i < 8; i++) {
302
+ tmp[i] = GGML_FP16_TO_FP32(x[i]);
303
+ }
304
+
305
+ return _mm256_loadu_ps(tmp);
306
+ }
307
+ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
308
+ float arr[8];
309
+
310
+ _mm256_storeu_ps(arr, y);
311
+
312
+ for (int i = 0; i < 8; i++)
313
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
314
+ }
315
+ #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
316
+ #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
317
+ #endif
318
+
319
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
320
+ #define GGML_F32Cx8_ADD _mm256_add_ps
321
+ #define GGML_F32Cx8_MUL _mm256_mul_ps
322
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
323
+
324
+ #define GGML_F16_VEC GGML_F32Cx8
325
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
326
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
327
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
328
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
329
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
330
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
331
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
332
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
333
+
334
+ #elif defined(__POWER9_VECTOR__)
335
+
336
+ #define GGML_SIMD
337
+
338
+ // F32 POWER9
339
+
340
+ #define GGML_F32_STEP 32
341
+ #define GGML_F32_EPR 4
342
+
343
+ #define GGML_F32x4 vector float
344
+ #define GGML_F32x4_ZERO {0.0f}
345
+ #define GGML_F32x4_SET1 vec_splats
346
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
347
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
348
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
349
+ #define GGML_F32x4_ADD vec_add
350
+ #define GGML_F32x4_MUL vec_mul
351
// Horizontal sum of the GGML_F32_ARR vector-float accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part with vec_add;
// the four lanes of x[0] are then extracted and added. x[] is modified in place.
// The body is wrapped in do { } while (0), like the NEON/AVX reducers in this file,
// so the macro acts as one statement inside if/else; arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                        \
do {                                                     \
    int offset = GGML_F32_ARR >> 1;                      \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);         \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);         \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);         \
    }                                                    \
    (res) = (ggml_float) (vec_extract((x)[0], 0) +       \
                          vec_extract((x)[0], 1) +       \
                          vec_extract((x)[0], 2) +       \
                          vec_extract((x)[0], 3));       \
} while (0)
370
+
371
+ #define GGML_F32_VEC GGML_F32x4
372
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
373
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
374
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
375
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
376
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
377
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
378
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
379
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
380
+
381
+ // F16 POWER9
382
+ #define GGML_F16_STEP GGML_F32_STEP
383
+ #define GGML_F16_EPR GGML_F32_EPR
384
+ #define GGML_F16_VEC GGML_F32x4
385
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
386
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
387
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
388
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
389
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
390
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
391
+ // Use vec_xl, not vec_ld, in case the load address is not aligned.
392
// Loads one F16 register's worth of data as a float vector for register index i.
// Odd i: the vector is read GGML_F16_EPR elements before p and passed through
// vec_extract_fp32_from_shorth; even i: the vector at p is passed through
// vec_extract_fp32_from_shortl.
// The whole expansion and both arguments are parenthesized so the macro can be
// embedded in larger expressions without the ?: operator capturing its neighbours.
#define GGML_F16_VEC_LOAD(p, i) (((i) & 0x1) ?                        \
    vec_extract_fp32_from_shorth(vec_xl(0, (p) - GGML_F16_EPR)) :     \
    vec_extract_fp32_from_shortl(vec_xl(0, (p))))
395
// Returns byte i of a uint16_t holding the value 1, i.e. the byte at offset i in
// that object's in-memory representation. i is not range-checked.
static inline unsigned char ggml_endian_byte(int i) {
    const uint16_t probe = 1;
    const unsigned char * bytes = (const unsigned char *) &probe;
    return bytes[i];
}
399
+ #define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
400
// Stores F16 data for register index i. Nothing is written for even i; for odd i
// two registers of r[] (picked via GGML_ENDIAN_BYTE) are packed with
// vec_pack_to_short_fp32 and written GGML_F16_EPR elements before p.
// Wrapped in do { } while (0) so a following `else` cannot bind to the inner `if`,
// and all arguments are parenthesized.
#define GGML_F16_VEC_STORE(p, r, i)                                          \
    do {                                                                     \
        if ((i) & 0x1) {                                                     \
            vec_xst(vec_pack_to_short_fp32((r)[(i) - GGML_ENDIAN_BYTE(1)],   \
                                           (r)[(i) - GGML_ENDIAN_BYTE(0)]),  \
                    0, (p) - GGML_F16_EPR);                                  \
        }                                                                    \
    } while (0)
405
+
406
+ #elif defined(__wasm_simd128__)
407
+
408
+ #define GGML_SIMD
409
+
410
+ // F32 WASM
411
+
412
+ #define GGML_F32_STEP 16
413
+ #define GGML_F32_EPR 4
414
+
415
+ #define GGML_F32x4 v128_t
416
+ #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
417
+ #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
418
+ #define GGML_F32x4_LOAD wasm_v128_load
419
+ #define GGML_F32x4_STORE wasm_v128_store
420
+ #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
421
+ #define GGML_F32x4_ADD wasm_f32x4_add
422
+ #define GGML_F32x4_MUL wasm_f32x4_mul
423
// Horizontal sum of the GGML_F32_ARR v128_t accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part with
// wasm_f32x4_add; the four lanes of x[0] are then extracted and added.
// x[] is modified in place.
// The body is wrapped in do { } while (0) so the macro acts as one statement inside
// if/else, and both arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                                  \
do {                                                               \
    int offset = GGML_F32_ARR >> 1;                                \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    (res) = (ggml_float) (wasm_f32x4_extract_lane((x)[0], 0) +     \
                          wasm_f32x4_extract_lane((x)[0], 1) +     \
                          wasm_f32x4_extract_lane((x)[0], 2) +     \
                          wasm_f32x4_extract_lane((x)[0], 3));     \
} while (0)
442
+
443
+ #define GGML_F32_VEC GGML_F32x4
444
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
445
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
446
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
447
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
448
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
449
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
450
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
451
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
452
+
453
+ // F16 WASM
454
+
455
+ #define GGML_F16_STEP 16
456
+ #define GGML_F16_EPR 4
457
+
458
+ inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
459
+ float tmp[4];
460
+
461
+ tmp[0] = GGML_FP16_TO_FP32(p[0]);
462
+ tmp[1] = GGML_FP16_TO_FP32(p[1]);
463
+ tmp[2] = GGML_FP16_TO_FP32(p[2]);
464
+ tmp[3] = GGML_FP16_TO_FP32(p[3]);
465
+
466
+ return wasm_v128_load(tmp);
467
+ }
468
+
469
+ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
470
+ float tmp[4];
471
+
472
+ wasm_v128_store(tmp, x);
473
+
474
+ p[0] = GGML_FP32_TO_FP16(tmp[0]);
475
+ p[1] = GGML_FP32_TO_FP16(tmp[1]);
476
+ p[2] = GGML_FP32_TO_FP16(tmp[2]);
477
+ p[3] = GGML_FP32_TO_FP16(tmp[3]);
478
+ }
479
+
480
+ #define GGML_F16x4 v128_t
481
+ #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
482
+ #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
483
+ #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
484
+ #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
485
+ #define GGML_F16x4_FMA GGML_F32x4_FMA
486
+ #define GGML_F16x4_ADD wasm_f32x4_add
487
+ #define GGML_F16x4_MUL wasm_f32x4_mul
488
// Horizontal sum of the GGML_F16_ARR F16-path accumulators (held as v128_t floats)
// in x[] into the scalar res. Three halving passes add the upper part of x[] onto
// the lower part with wasm_f32x4_add; the four lanes of x[0] are then extracted
// and added. x[] is modified in place.
// The body is wrapped in do { } while (0) so the macro acts as one statement inside
// if/else, and both arguments are parenthesized.
#define GGML_F16x4_REDUCE(res, x)                                  \
do {                                                               \
    int offset = GGML_F16_ARR >> 1;                                \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);            \
    }                                                              \
    (res) = (ggml_float) (wasm_f32x4_extract_lane((x)[0], 0) +     \
                          wasm_f32x4_extract_lane((x)[0], 1) +     \
                          wasm_f32x4_extract_lane((x)[0], 2) +     \
                          wasm_f32x4_extract_lane((x)[0], 3));     \
} while (0)
507
+
508
+ #define GGML_F16_VEC GGML_F16x4
509
+ #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
510
+ #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
511
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
512
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
513
+ #define GGML_F16_VEC_FMA GGML_F16x4_FMA
514
+ #define GGML_F16_VEC_ADD GGML_F16x4_ADD
515
+ #define GGML_F16_VEC_MUL GGML_F16x4_MUL
516
+ #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
517
+
518
+ #elif defined(__SSE3__)
519
+
520
+ #define GGML_SIMD
521
+
522
+ // F32 SSE
523
+
524
+ #define GGML_F32_STEP 32
525
+ #define GGML_F32_EPR 4
526
+
527
+ #define GGML_F32x4 __m128
528
+ #define GGML_F32x4_ZERO _mm_setzero_ps()
529
+ #define GGML_F32x4_SET1(x) _mm_set1_ps(x)
530
+ #define GGML_F32x4_LOAD _mm_loadu_ps
531
+ #define GGML_F32x4_STORE _mm_storeu_ps
532
+ #if defined(__FMA__)
533
+ // TODO: Does this work?
534
+ #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
535
+ #else
536
+ #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
537
+ #endif
538
+ #define GGML_F32x4_ADD _mm_add_ps
539
+ #define GGML_F32x4_MUL _mm_mul_ps
540
// Horizontal sum of the GGML_F32_ARR __m128 accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part with
// _mm_add_ps; x[0] is then reduced with two _mm_hadd_ps steps and its first lane
// is read with _mm_cvtss_f32. x[] is modified in place.
// The body is wrapped in do { } while (0) so the macro acts as one statement inside
// if/else, and both arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                                  \
do {                                                               \
    int offset = GGML_F32_ARR >> 1;                                \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    const __m128 t0 = _mm_hadd_ps((x)[0], (x)[0]);                 \
    (res) = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));       \
} while (0)
557
+ // TODO: is this optimal ?
558
+
559
+ #define GGML_F32_VEC GGML_F32x4
560
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
561
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
562
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
563
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
564
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
565
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
566
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
567
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
568
+
569
+ // F16 SSE
570
+
571
+ #define GGML_F16_STEP 32
572
+ #define GGML_F16_EPR 4
573
+
574
+ static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
575
+ float tmp[4];
576
+
577
+ tmp[0] = GGML_FP16_TO_FP32(x[0]);
578
+ tmp[1] = GGML_FP16_TO_FP32(x[1]);
579
+ tmp[2] = GGML_FP16_TO_FP32(x[2]);
580
+ tmp[3] = GGML_FP16_TO_FP32(x[3]);
581
+
582
+ return _mm_loadu_ps(tmp);
583
+ }
584
+
585
+ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
586
+ float arr[4];
587
+
588
+ _mm_storeu_ps(arr, y);
589
+
590
+ x[0] = GGML_FP32_TO_FP16(arr[0]);
591
+ x[1] = GGML_FP32_TO_FP16(arr[1]);
592
+ x[2] = GGML_FP32_TO_FP16(arr[2]);
593
+ x[3] = GGML_FP32_TO_FP16(arr[3]);
594
+ }
595
+
596
+ #define GGML_F32Cx4 __m128
597
+ #define GGML_F32Cx4_ZERO _mm_setzero_ps()
598
+ #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
599
+ #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
600
+ #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
601
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
602
+ #define GGML_F32Cx4_ADD _mm_add_ps
603
+ #define GGML_F32Cx4_MUL _mm_mul_ps
604
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
605
+
606
+ #define GGML_F16_VEC GGML_F32Cx4
607
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
608
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
609
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
610
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
611
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
612
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
613
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
614
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
615
+
616
+ #elif defined(__loongarch_asx)
617
+
618
+ #define GGML_SIMD
619
+
620
+ // F32 LASX
621
+ #define GGML_F32_STEP 32
622
+ #define GGML_F32_EPR 8
623
+
624
+ #define GGML_F32x8 __m256
625
+ #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
626
+ #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
627
+ #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
628
+ #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
629
+ #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
630
+ #define GGML_F32x8_ADD __lasx_xvfadd_s
631
+ #define GGML_F32x8_MUL __lasx_xvfmul_s
632
// Horizontal sum of the GGML_F32_ARR __m256 accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part with
// __lasx_xvfadd_s; the eight floats of x[0] are then read through a float pointer
// and added. x[] is modified in place.
// Both macro arguments are parenthesized and the result is converted to ggml_float
// explicitly, matching the other reducers in this file.
#define GGML_F32x8_REDUCE(res, x)                                        \
do {                                                                     \
    int offset = GGML_F32_ARR >> 1;                                      \
    for (int i = 0; i < offset; ++i) {                                   \
        (x)[i] = __lasx_xvfadd_s((x)[i], (x)[offset+i]);                 \
    }                                                                    \
    offset >>= 1;                                                        \
    for (int i = 0; i < offset; ++i) {                                   \
        (x)[i] = __lasx_xvfadd_s((x)[i], (x)[offset+i]);                 \
    }                                                                    \
    offset >>= 1;                                                        \
    for (int i = 0; i < offset; ++i) {                                   \
        (x)[i] = __lasx_xvfadd_s((x)[i], (x)[offset+i]);                 \
    }                                                                    \
    float *tmp_p = (float *)&(x)[0];                                     \
    (res) = (ggml_float) (tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] +    \
                          tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]);    \
} while (0)
649
+ // TODO: is this optimal ?
650
+
651
+ #define GGML_F32_VEC GGML_F32x8
652
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
653
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
654
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
655
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
656
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
657
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
658
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
659
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
660
+
661
+ // F16 LASX
662
+
663
+ #define GGML_F16_STEP 32
664
+ #define GGML_F16_EPR 8
665
+
666
+ // F16 arithmetic is not supported by LASX, so we use F32 instead
667
+
668
+ #define GGML_F32Cx8 __m256
669
+ #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
670
// Broadcasts the float x to all lanes of a __m256 for the F16-as-F32 path.
// Uses the same float replicate helper as GGML_F32x8_SET1 in this branch; the
// integer replicate (__lasx_xvreplgr2vr_w) takes an integer register operand, so a
// float argument would be value-converted to an integer before being replicated.
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
671
+
672
+ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
673
+ __m256i a;
674
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
675
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4));
676
+ return __lasx_xvfcvtl_s_h(a);
677
+ }
678
+
679
+ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
680
+ __m256i a = __lasx_xvfcvt_h_s(y, y);
681
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2));
682
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
683
+ }
684
+ #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
685
+ #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
686
+
687
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
688
+ #define GGML_F32Cx8_ADD __lasx_xvfadd_s
689
+ #define GGML_F32Cx8_MUL __lasx_xvfmul_s
690
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
691
+
692
+ #define GGML_F16_VEC GGML_F32Cx8
693
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
694
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
695
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
696
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
697
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
698
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
699
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
700
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
701
+
702
+ #elif defined(__loongarch_sx)
703
+
704
+ #define GGML_SIMD
705
+
706
+ // F32 LSX
707
+
708
+ #define GGML_F32_STEP 32
709
+ #define GGML_F32_EPR 4
710
+
711
+ #define GGML_F32x4 __m128
712
+ #define GGML_F32x4_ZERO __lsx_vldi(0)
713
// Builds the "all lanes = x" vector for the LSX F32 path from a zero vector
// (__lsx_vldi(0)) and __lsx_vinsgr2vr_w with lane index 0.
// NOTE(review): __lsx_vinsgr2vr_w appears to insert an integer register value into
// a single lane rather than broadcasting a float to every lane - confirm this
// matches the SET1 contract the other backends implement with splat/set1 calls.
+ #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
714
+ #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
715
// Stores vector y to address x with __lsx_vst (offset 0).
// A macro parameter list must consist of plain identifiers; the previous
// `((x),(y))` form is not a valid definition and fails to preprocess.
#define GGML_F32x4_STORE(x, y) __lsx_vst((y), (x), 0)
716
+ #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
717
+ #define GGML_F32x4_ADD __lsx_vfadd_s
718
+ #define GGML_F32x4_MUL __lsx_vfmul_s
719
// Horizontal sum of the GGML_F32_ARR __m128 accumulators in x[] into the scalar res.
// Three halving passes add the upper part of x[] onto the lower part with
// __lsx_vfadd_s; the four floats of x[0] are then read through a float pointer and
// added, the same way the LASX reducer in this file finishes. x[] is modified in place.
// This replaces a tail that passed the result of __lsx_vpickve2gr_w (an integer
// lane extract) straight to a ggml_float conversion and assigned integer vectors to
// a __m128 without a cast.
// The body is wrapped in do { } while (0) so the macro acts as one statement inside
// if/else, and both arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                                         \
do {                                                                      \
    int offset = GGML_F32_ARR >> 1;                                       \
    for (int i = 0; i < offset; ++i) {                                    \
        (x)[i] = __lsx_vfadd_s((x)[i], (x)[offset + i]);                  \
    }                                                                     \
    offset >>= 1;                                                         \
    for (int i = 0; i < offset; ++i) {                                    \
        (x)[i] = __lsx_vfadd_s((x)[i], (x)[offset + i]);                  \
    }                                                                     \
    offset >>= 1;                                                         \
    for (int i = 0; i < offset; ++i) {                                    \
        (x)[i] = __lsx_vfadd_s((x)[i], (x)[offset + i]);                  \
    }                                                                     \
    const float *tmp = (const float *)&(x)[0];                            \
    (res) = (ggml_float) (tmp[0] + tmp[1] + tmp[2] + tmp[3]);             \
} while (0)
742
+
743
+ #define GGML_F32_VEC GGML_F32x4
744
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
745
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
746
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
747
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
748
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
749
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
750
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
751
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
752
+
753
+ // F16 LSX
754
+
755
+ #define GGML_F16_STEP 32
756
+ #define GGML_F16_EPR 4
757
+
758
+ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
759
+ float tmp[4];
760
+
761
+ tmp[0] = GGML_FP16_TO_FP32(x[0]);
762
+ tmp[1] = GGML_FP16_TO_FP32(x[1]);
763
+ tmp[2] = GGML_FP16_TO_FP32(x[2]);
764
+ tmp[3] = GGML_FP16_TO_FP32(x[3]);
765
+
766
+ return __lsx_vld(tmp, 0);
767
+ }
768
+
769
+ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
770
+ float arr[4];
771
+
772
+ __lsx_vst(y, arr, 0);
773
+
774
+ x[0] = GGML_FP32_TO_FP16(arr[0]);
775
+ x[1] = GGML_FP32_TO_FP16(arr[1]);
776
+ x[2] = GGML_FP32_TO_FP16(arr[2]);
777
+ x[3] = GGML_FP32_TO_FP16(arr[3]);
778
+ }
779
+
780
+ #define GGML_F32Cx4 __m128
781
+ #define GGML_F32Cx4_ZERO __lsx_vldi(0)
782
// Builds the "all lanes = x" vector for the LSX F16-as-F32 path from a zero vector
// (__lsx_vldi(0)) and __lsx_vinsgr2vr_w with lane index 0.
// NOTE(review): __lsx_vinsgr2vr_w appears to insert an integer register value into
// a single lane rather than broadcasting a float to every lane - confirm this
// matches the SET1 contract the other backends implement with splat/set1 calls.
+ #define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
783
+ #define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
784
+ #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
785
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
786
+ #define GGML_F32Cx4_ADD __lsx_vfadd_s
787
+ #define GGML_F32Cx4_MUL __lsx_vfmul_s
788
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
789
+
790
+ #define GGML_F16_VEC GGML_F32Cx4
791
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
792
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
793
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
794
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
795
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
796
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
797
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
798
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
799
+
800
+ #elif defined(__VXE__) || defined(__VXE2__)
801
+
802
+ #define GGML_SIMD
803
+
804
+ // F32 s390x
805
+
806
+ #define GGML_F32_STEP 32
807
+ #define GGML_F32_EPR 4
808
+
809
+ #define GGML_F32x4 __vector float
810
+ #define GGML_F32x4_ZERO vec_splats(0.0f)
811
+ #define GGML_F32x4_SET1 vec_splats
812
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
813
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
814
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
815
+ #define GGML_F32x4_ADD vec_add
816
+ #define GGML_F32x4_MUL vec_mul
817
// Horizontal sum of the GGML_F32_ARR __vector float accumulators in x[] into res.
// Three halving passes add the upper part of x[] onto the lower part with vec_add;
// the four lanes of x[0] are then extracted and added. x[] is modified in place.
// The body is wrapped in do { } while (0) so the macro acts as one statement inside
// if/else, and both arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                        \
do {                                                     \
    int offset = GGML_F32_ARR >> 1;                      \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset + i]);       \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset + i]);       \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vec_add((x)[i], (x)[offset + i]);       \
    }                                                    \
    (res) = (ggml_float) (vec_extract((x)[0], 0) +       \
                          vec_extract((x)[0], 1) +       \
                          vec_extract((x)[0], 2) +       \
                          vec_extract((x)[0], 3));       \
} while (0)
836
+
837
+ #define GGML_F32_VEC GGML_F32x4
838
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
839
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
840
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
841
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
842
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
843
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
844
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
845
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
846
+
847
+ // F16 s390x
848
+ #define GGML_F16_STEP GGML_F32_STEP
849
+ #define GGML_F16_EPR GGML_F32_EPR
850
+
851
+ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
852
+ float tmp[4];
853
+
854
+ for (int i = 0; i < 4; i++) {
855
+ tmp[i] = GGML_FP16_TO_FP32(x[i]);
856
+ }
857
+
858
+ // note: keep type-cast here to prevent compiler bugs
859
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
860
+ return vec_xl(0, (const float *)(tmp));
861
+ }
862
+
863
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
864
+ float arr[4];
865
+
866
+ // note: keep type-cast here to prevent compiler bugs
867
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
868
+ vec_xst(y, 0, (float *)(arr));
869
+
870
+ for (int i = 0; i < 4; i++) {
871
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
872
+ }
873
+ }
874
+
875
+ #define GGML_F16_VEC GGML_F32x4
876
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
877
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
878
+ #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
879
+ #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
880
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
881
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
882
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
883
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
884
+
885
+ #endif
886
+
887
+ // GGML_F32_ARR / GGML_F16_ARR
888
+ // number of registers to use per step
889
+ #ifdef GGML_SIMD
890
+ #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
891
+ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
892
+ #endif