whispercpp 1.3.1 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (857)
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -3
  3. data/README.md +161 -43
  4. data/Rakefile +45 -13
  5. data/ext/.gitignore +4 -8
  6. data/ext/dependencies.rb +73 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +85 -0
  9. data/ext/ruby_whisper.c +177 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +672 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1303 -0
  15. data/ext/ruby_whisper_segment.c +220 -0
  16. data/ext/ruby_whisper_transcribe.cpp +93 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  19. data/ext/sources/CMakeLists.txt +255 -0
  20. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  21. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  22. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  23. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  24. data/ext/sources/bindings/javascript/package.json +26 -0
  25. data/ext/sources/bindings/javascript/whisper.js +19 -0
  26. data/ext/sources/build-xcframework.sh +547 -0
  27. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  28. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  29. data/ext/sources/cmake/build-info.cmake +60 -0
  30. data/ext/sources/cmake/git-vars.cmake +22 -0
  31. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  32. data/ext/sources/cmake/whisper.pc.in +10 -0
  33. data/ext/sources/examples/CMakeLists.txt +124 -0
  34. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  35. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +133 -0
  36. data/ext/sources/examples/addon.node/addon.cpp +557 -0
  37. data/ext/sources/examples/addon.node/index.js +57 -0
  38. data/ext/sources/examples/addon.node/package.json +16 -0
  39. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  40. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  41. data/ext/sources/examples/bench/bench.cpp +176 -0
  42. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  43. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  44. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  45. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  46. data/ext/sources/examples/cli/cli.cpp +1295 -0
  47. data/ext/sources/examples/coi-serviceworker.js +146 -0
  48. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  49. data/ext/sources/examples/command/command.cpp +800 -0
  50. data/ext/sources/examples/command/commands.txt +9 -0
  51. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  52. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  53. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  54. data/ext/sources/examples/common-ggml.cpp +238 -0
  55. data/ext/sources/examples/common-ggml.h +18 -0
  56. data/ext/sources/examples/common-sdl.cpp +227 -0
  57. data/ext/sources/examples/common-sdl.h +49 -0
  58. data/ext/sources/examples/common-whisper.cpp +175 -0
  59. data/ext/sources/examples/common-whisper.h +24 -0
  60. data/ext/sources/examples/common.cpp +675 -0
  61. data/ext/sources/examples/common.h +322 -0
  62. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  63. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  64. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  65. data/ext/sources/examples/generate-karaoke.sh +57 -0
  66. data/ext/sources/examples/grammar-parser.cpp +423 -0
  67. data/ext/sources/examples/grammar-parser.h +29 -0
  68. data/ext/sources/examples/helpers.js +191 -0
  69. data/ext/sources/examples/json.hpp +24596 -0
  70. data/ext/sources/examples/livestream.sh +112 -0
  71. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  72. data/ext/sources/examples/lsp/lsp.cpp +469 -0
  73. data/ext/sources/examples/lsp/whisper.vim +362 -0
  74. data/ext/sources/examples/miniaudio.h +93468 -0
  75. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  76. data/ext/sources/examples/python/whisper_processor.py +54 -0
  77. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  78. data/ext/sources/examples/quantize/quantize.cpp +226 -0
  79. data/ext/sources/examples/server/CMakeLists.txt +15 -0
  80. data/ext/sources/examples/server/bench.js +29 -0
  81. data/ext/sources/examples/server/httplib.h +10497 -0
  82. data/ext/sources/examples/server/server.cpp +1238 -0
  83. data/ext/sources/examples/server.py +115 -0
  84. data/ext/sources/examples/stb_vorbis.c +5584 -0
  85. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  86. data/ext/sources/examples/stream/stream.cpp +435 -0
  87. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  88. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  89. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  90. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  91. data/ext/sources/examples/sycl/build.sh +22 -0
  92. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  93. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  94. data/ext/sources/examples/talk-llama/CMakeLists.txt +43 -0
  95. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  96. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  97. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  98. data/ext/sources/examples/talk-llama/llama-arch.cpp +1914 -0
  99. data/ext/sources/examples/talk-llama/llama-arch.h +464 -0
  100. data/ext/sources/examples/talk-llama/llama-batch.cpp +843 -0
  101. data/ext/sources/examples/talk-llama/llama-batch.h +147 -0
  102. data/ext/sources/examples/talk-llama/llama-chat.cpp +685 -0
  103. data/ext/sources/examples/talk-llama/llama-chat.h +59 -0
  104. data/ext/sources/examples/talk-llama/llama-context.cpp +2845 -0
  105. data/ext/sources/examples/talk-llama/llama-context.h +297 -0
  106. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  107. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  108. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  109. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  110. data/ext/sources/examples/talk-llama/llama-graph.cpp +1693 -0
  111. data/ext/sources/examples/talk-llama/llama-graph.h +710 -0
  112. data/ext/sources/examples/talk-llama/llama-hparams.cpp +103 -0
  113. data/ext/sources/examples/talk-llama/llama-hparams.h +207 -0
  114. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  115. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  116. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  117. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  118. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  119. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  120. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +44 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +439 -0
  124. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  125. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  126. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  127. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  128. data/ext/sources/examples/talk-llama/llama-memory.cpp +59 -0
  129. data/ext/sources/examples/talk-llama/llama-memory.h +116 -0
  130. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  131. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  132. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1163 -0
  133. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  134. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +282 -0
  135. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  136. data/ext/sources/examples/talk-llama/llama-model.cpp +15114 -0
  137. data/ext/sources/examples/talk-llama/llama-model.h +452 -0
  138. data/ext/sources/examples/talk-llama/llama-quant.cpp +1049 -0
  139. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  140. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  141. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  142. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3377 -0
  143. data/ext/sources/examples/talk-llama/llama-vocab.h +132 -0
  144. data/ext/sources/examples/talk-llama/llama.cpp +358 -0
  145. data/ext/sources/examples/talk-llama/llama.h +1484 -0
  146. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  147. data/ext/sources/examples/talk-llama/speak +40 -0
  148. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  149. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  150. data/ext/sources/examples/talk-llama/talk-llama.cpp +810 -0
  151. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  152. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  153. data/ext/sources/examples/talk-llama/unicode.cpp +854 -0
  154. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  155. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  156. data/ext/sources/examples/vad-speech-segments/speech.cpp +149 -0
  157. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  158. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  159. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  160. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  161. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  162. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  163. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  164. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  165. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +251 -0
  166. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  167. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  168. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  169. data/ext/sources/ggml/CMakeLists.txt +435 -0
  170. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  171. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  172. data/ext/sources/ggml/cmake/common.cmake +50 -0
  173. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  174. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +10 -8
  176. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +11 -1
  178. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  179. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  180. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  181. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  182. data/ext/{ggml → sources/ggml}/include/ggml.h +325 -269
  183. data/ext/sources/ggml/include/gguf.h +202 -0
  184. data/ext/sources/ggml/src/CMakeLists.txt +404 -0
  185. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  186. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  187. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  188. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +92 -53
  189. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +69 -34
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  191. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +75 -0
  192. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  195. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  196. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  197. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +140 -1
  198. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +588 -146
  199. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  200. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  201. data/ext/{ggml → sources/ggml}/src/ggml-common.h +16 -8
  202. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +597 -0
  203. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +3 -2
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +11 -10
  205. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  208. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  209. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  210. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  211. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  212. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  213. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  214. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  215. data/ext/{ggml/src/ggml-cpu/cpu-feats-x86.cpp → sources/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp} +5 -1
  216. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  217. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +3285 -0
  218. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  219. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  220. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  221. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  222. data/ext/sources/ggml/src/ggml-cpu/common.h +73 -0
  223. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +172 -41
  224. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3551 -0
  225. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +78 -25
  226. data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.cpp → sources/ggml/src/ggml-cpu/hbm.cpp} +1 -1
  227. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  228. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  229. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  230. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  231. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3594 -0
  232. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +19 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +9786 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ops.h +118 -0
  235. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  236. data/ext/{ggml/src/ggml-cpu/ggml-cpu-quants.h → sources/ggml/src/ggml-cpu/quants.h} +26 -0
  237. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  238. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  239. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +1184 -0
  240. data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.cpp → sources/ggml/src/ggml-cpu/traits.cpp} +1 -1
  241. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  242. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  243. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +345 -0
  244. data/ext/sources/ggml/src/ggml-cpu/vec.h +1027 -0
  245. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  246. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  247. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  248. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  249. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  250. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  251. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  252. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  253. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  254. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  255. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  256. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  257. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/common.cuh +851 -0
  259. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  260. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  262. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  264. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  266. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  267. data/ext/sources/ggml/src/ggml-cuda/convert.cu +752 -0
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +31 -0
  269. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  270. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  273. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  276. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  277. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  278. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1474 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +638 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  291. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  292. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  293. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3647 -0
  294. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  295. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  296. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  297. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  298. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  299. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  301. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +506 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +11 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  308. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  309. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  310. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  312. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  313. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  314. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  318. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  319. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  320. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  321. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  322. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  323. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  324. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  325. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  326. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +155 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  330. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +26 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  458. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/unary.cu +378 -0
  460. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +66 -0
  461. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  462. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  463. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  464. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  465. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  466. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  467. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  468. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  469. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +135 -0
  470. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +147 -158
  471. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  509. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +121 -0
  510. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +649 -0
  511. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2504 -1108
  512. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +2102 -1463
  513. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  514. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  515. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  516. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +110 -0
  517. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +6494 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  557. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  558. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  559. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  560. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  561. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  562. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  563. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  564. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  565. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  566. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  567. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  568. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  569. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +120 -128
  570. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  571. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +494 -84
  572. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  573. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  574. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +344 -0
  575. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  576. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  577. data/ext/sources/ggml/src/ggml-sycl/common.hpp +561 -0
  578. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +56 -70
  579. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  580. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +8 -12
  581. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  582. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +575 -0
  583. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  584. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +839 -0
  585. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  586. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +823 -0
  587. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +188 -67
  588. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  589. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2987 -0
  590. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1120 -0
  591. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +84 -0
  592. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +102 -0
  593. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +212 -0
  594. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  595. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1197 -1295
  596. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  597. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  598. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  599. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  600. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +60 -81
  601. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  602. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1065 -0
  603. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  604. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +482 -0
  605. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  606. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  607. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  608. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  609. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +111 -0
  610. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +472 -0
  611. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  612. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +38 -28
  613. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  614. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +15 -0
  615. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +26 -0
  616. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +6 -11
  617. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  618. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1307 -0
  619. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +289 -0
  620. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +200 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  623. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3822 -1335
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +61 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  740. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +203 -36
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  743. data/ext/{ggml → sources/ggml}/src/ggml.c +918 -1782
  744. data/ext/sources/ggml/src/ggml.cpp +26 -0
  745. data/ext/sources/ggml/src/gguf.cpp +1351 -0
  746. data/ext/{include → sources/include}/whisper.h +70 -2
  747. data/ext/sources/src/CMakeLists.txt +145 -0
  748. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  749. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  750. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  751. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +36 -10
  752. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  753. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +29 -3
  754. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  755. data/ext/sources/src/whisper-arch.h +197 -0
  756. data/ext/{src → sources/src}/whisper.cpp +1966 -386
  757. data/ext/sources/tests/CMakeLists.txt +105 -0
  758. data/ext/sources/tests/earnings21/eval.mk +58 -0
  759. data/ext/sources/tests/earnings21/eval.py +68 -0
  760. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  761. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  762. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  763. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  764. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  765. data/ext/sources/tests/en-0-ref.txt +1 -0
  766. data/ext/sources/tests/en-1-ref.txt +1 -0
  767. data/ext/sources/tests/en-2-ref.txt +1 -0
  768. data/ext/sources/tests/es-0-ref.txt +1 -0
  769. data/ext/sources/tests/librispeech/eval.mk +39 -0
  770. data/ext/sources/tests/librispeech/eval.py +47 -0
  771. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  772. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  773. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  774. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  775. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  776. data/ext/sources/tests/run-tests.sh +130 -0
  777. data/ext/sources/tests/test-c.c +3 -0
  778. data/ext/sources/tests/test-vad-full.cpp +54 -0
  779. data/ext/sources/tests/test-vad.cpp +83 -0
  780. data/ext/sources/tests/test-whisper.js +58 -0
  781. data/extsources.rb +39 -5
  782. data/lib/whisper/context.rb +15 -0
  783. data/lib/whisper/model/uri.rb +202 -126
  784. data/lib/whisper/segment.rb +58 -0
  785. data/sig/whisper.rbs +510 -0
  786. data/test/helper.rb +24 -0
  787. data/{tests → test}/test_callback.rb +45 -3
  788. data/{tests → test}/test_error.rb +2 -2
  789. data/{tests → test}/test_model.rb +47 -0
  790. data/test/test_package.rb +51 -0
  791. data/test/test_params.rb +297 -0
  792. data/test/test_segment.rb +146 -0
  793. data/test/test_vad.rb +19 -0
  794. data/test/test_vad_params.rb +103 -0
  795. data/{tests → test}/test_whisper.rb +106 -36
  796. data/whispercpp.gemspec +5 -5
  797. metadata +837 -134
  798. data/ext/cpu.mk +0 -9
  799. data/ext/examples/dr_wav.h +0 -8815
  800. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  801. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  802. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  803. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -10835
  804. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  805. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  806. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  807. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  808. data/ext/ggml/src/ggml-sycl/convert.cpp +0 -547
  809. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  810. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  811. data/ext/ggml/src/ggml-sycl/mmvq.cpp +0 -1015
  812. data/ext/ggml/src/ggml-sycl/norm.cpp +0 -378
  813. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  814. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  815. data/ext/metal-embed.mk +0 -17
  816. data/ext/metal.mk +0 -6
  817. data/ext/ruby_whisper.cpp +0 -1909
  818. data/ext/scripts/get-flags.mk +0 -38
  819. data/lib/whisper.rb +0 -2
  820. data/tests/helper.rb +0 -7
  821. data/tests/test_package.rb +0 -31
  822. data/tests/test_params.rb +0 -160
  823. data/tests/test_segment.rb +0 -83
  824. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  825. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  826. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  827. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  828. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  829. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  830. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  831. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  832. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  833. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  834. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  835. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  836. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  837. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  838. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  839. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  840. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  841. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  842. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  843. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  844. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  845. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  846. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.h → sources/ggml/src/ggml-cpu/hbm.h} +0 -0
  847. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.h → sources/ggml/src/ggml-cpu/traits.h} +0 -0
  848. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  849. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  850. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  851. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  852. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  853. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  854. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
  855. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  856. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  857. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
@@ -0,0 +1,1184 @@
1
+ #pragma once
2
+
3
+ #include "ggml-cpu-impl.h"
4
+
5
+ #ifdef __ARM_FEATURE_SVE
6
+ #include <arm_sve.h>
7
+ #endif // __ARM_FEATURE_SVE
8
+
9
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
10
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
11
+ //
12
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
13
+ //
14
+ #include <arm_neon.h>
15
+ #endif
16
+
17
+ #if defined(__F16C__)
18
+ #include <immintrin.h>
19
+ #endif
20
+
21
+ #ifdef __cplusplus
22
+ extern "C" {
23
+ #endif
24
+
25
+ //
26
+ // simd mappings
27
+ //
28
+
29
+ // FP16 to FP32 conversion
30
+
31
+ // 16-bit float
32
+ // on Arm, we use __fp16
33
+ // on x86, we use uint16_t
34
+ //
35
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
36
+ // for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
37
+ //
38
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
39
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
40
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
41
+
42
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
43
+
44
+ static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
45
+ __fp16 tmp;
46
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
47
+ return (float)tmp;
48
+ }
49
+
50
+ static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
51
+ ggml_fp16_t res;
52
+ __fp16 tmp = f;
53
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
54
+ return res;
55
+ }
56
+ #elif defined(__F16C__)
57
+ #ifdef _MSC_VER
58
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
59
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
60
+ #else
61
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
62
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
63
+ #endif
64
+ #elif defined(__POWER9_VECTOR__)
65
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
66
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
67
+ /* the inline asm below is about 12% faster than the lookup method */
68
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
69
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
70
+
71
+ static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
72
+ float f;
73
+ double d;
74
+ __asm__(
75
+ "mtfprd %0,%2\n"
76
+ "xscvhpdp %0,%0\n"
77
+ "frsp %1,%0\n" :
78
+ /* temp */ "=d"(d),
79
+ /* out */ "=f"(f):
80
+ /* in */ "r"(h));
81
+ return f;
82
+ }
83
+
84
+ static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
85
+ double d;
86
+ ggml_fp16_t r;
87
+ __asm__( /* xscvdphp can work on double or single precision */
88
+ "xscvdphp %0,%2\n"
89
+ "mffprd %1,%0\n" :
90
+ /* temp */ "=d"(d),
91
+ /* out */ "=r"(r):
92
+ /* in */ "f"(f));
93
+ return r;
94
+ }
95
+ #elif defined(__riscv) && defined(__riscv_zfhmin)
96
+ static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
97
+ float f;
98
+ __asm__(
99
+ "fmv.h.x %[f], %[h]\n\t"
100
+ "fcvt.s.h %[f], %[f]"
101
+ : [f] "=&f" (f)
102
+ : [h] "r" (h)
103
+ );
104
+ return f;
105
+ }
106
+
107
+ static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
108
+ ggml_fp16_t res;
109
+ __asm__(
110
+ "fcvt.h.s %[f], %[f]\n\t"
111
+ "fmv.x.h %[h], %[f]"
112
+ : [h] "=&r" (res)
113
+ : [f] "f" (f)
114
+ );
115
+ return res;
116
+ }
117
+
118
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
119
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
120
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
121
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
122
+ #elif defined(__NNPA__)
123
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
124
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
125
+
126
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
127
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
128
+
129
+ static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
130
+ uint16x8_t v_h = vec_splats(h);
131
+ uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
132
+ return vec_extend_to_fp32_hi(v_hd, 0)[0];
133
+ }
134
+
135
+ static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
136
+ float32x4_t v_f = vec_splats(f);
137
+ float32x4_t v_zero = vec_splats(0.0f);
138
+ uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
139
+ uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
140
+ return vec_extract(v_h, 0);
141
+ }
142
+ #endif
143
+
144
+ // precomputed f32 table for f16 (256 KB)
145
+ // defined in ggml-cpu.c, initialized in ggml_cpu_init()
146
+ extern float ggml_table_f32_f16[1 << 16];
147
+
148
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
149
+ // so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
150
+ // This is also true for POWER9.
151
+ #if !defined(GGML_CPU_FP16_TO_FP32)
152
+ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
153
+ uint16_t s;
154
+ memcpy(&s, &f, sizeof(uint16_t));
155
+ return ggml_table_f32_f16[s];
156
+ }
157
+
158
+ #define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
159
+ #endif
160
+
161
+ #if !defined(GGML_CPU_FP32_TO_FP16)
162
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
163
+ #endif
164
+
165
+
166
+ // we define a common set of C macros which map to specific intrinsics based on the current architecture
167
+ // we then implement the fundamental computation operations below using only these macros
168
+ // adding support for new architectures requires defining the corresponding SIMD macros
169
+ //
170
+ // GGML_F32_STEP / GGML_F16_STEP
171
+ // number of elements to process in a single step
172
+ //
173
+ // GGML_F32_EPR / GGML_F16_EPR
174
+ // number of elements to fit in a single register
175
+ //
176
+
177
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
178
+
179
+ #define GGML_SIMD
180
+
181
+ // F32 SVE
182
+ #define GGML_F32_EPR 8
183
+ #define DEFAULT_PG svptrue_b32()
184
+
185
+ #define GGML_F32xt svfloat32_t
186
+ #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
187
+ #define GGML_F32xt_SET1(x) svdup_n_f32(x)
188
+ #define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
189
+ #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
190
+ #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
191
+ #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
192
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
193
+ #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
194
+ #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
195
+ #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
196
+ #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
197
+ #define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
198
+ #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
199
+ #define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
200
// Pairwise-adds the eight accumulators into sum1, then horizontally sums sum1 into res.
// sum1, sum3, sum5 and sum7 are overwritten.
// NOTE(review): the pg argument is not used; every step runs under DEFAULT_PG.
// Wrapped in do { } while (0) so the expansion is a single statement that is
// safe inside unbraced if/else; terminate the invocation with ';'.
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
do {                                                        \
    (sum1) = svadd_f32_m(DEFAULT_PG, (sum1), (sum2));       \
    (sum3) = svadd_f32_m(DEFAULT_PG, (sum3), (sum4));       \
    (sum5) = svadd_f32_m(DEFAULT_PG, (sum5), (sum6));       \
    (sum7) = svadd_f32_m(DEFAULT_PG, (sum7), (sum8));       \
    (sum1) = svadd_f32_m(DEFAULT_PG, (sum1), (sum3));       \
    (sum5) = svadd_f32_m(DEFAULT_PG, (sum5), (sum7));       \
    (sum1) = svadd_f32_m(DEFAULT_PG, (sum1), (sum5));       \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);       \
} while (0)
211
+ #define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
212
+
213
+ #define GGML_F32_VEC GGML_F32xt
214
+ #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
215
+ #define GGML_F32_VEC_SET1 GGML_F32xt_SET1
216
+ #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
217
+ #define GGML_F32_VEC_STORE GGML_F32xt_STORE
218
+ #define GGML_F32_VEC_FMA GGML_F32xt_FMA
219
+ #define GGML_F32_VEC_ADD GGML_F32xt_ADD
220
+ #define GGML_F32_VEC_MUL GGML_F32xt_MUL
221
+ #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
222
+
223
+ // F16 NEON
224
+
225
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
226
+ #define GGML_F16_STEP 32
227
+ #define GGML_F16_EPR 8
228
+
229
+ #define GGML_F16x8 float16x8_t
230
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
231
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
232
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
233
+ #define GGML_F16x8_STORE vst1q_f16
234
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
235
+ #define GGML_F16x8_ADD vaddq_f16
236
+ #define GGML_F16x8_MUL vmulq_f16
237
// Folds the float16x8_t accumulators in x into x[0] with three halving passes
// (offsets GGML_F16_ARR/2, /4, /8), then widens both halves of x[0] to
// float32x4_t and sums all lanes into res. The elements of x are overwritten.
#define GGML_F16x8_REDUCE(res, x)                                   \
do {                                                                \
    int offset = GGML_F16_ARR;                                      \
    for (int reduce_pass = 0; reduce_pass < 3; ++reduce_pass) {     \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
    }                                                               \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0]));     \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0]));     \
    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));             \
} while (0)
255
+
256
+ #define GGML_F16_VEC GGML_F16x8
257
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
258
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
259
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
260
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
261
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
262
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
263
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
264
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
265
+ #else
266
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
267
+ // and take advantage of the vcvt_ functions to convert to/from FP16
268
+
269
+ #define GGML_F16_STEP 16
270
+ #define GGML_F16_EPR 4
271
+
272
+ #define GGML_F32Cx4 float32x4_t
273
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
274
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
275
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
276
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
277
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
278
+ #define GGML_F32Cx4_ADD vaddq_f32
279
+ #define GGML_F32Cx4_MUL vmulq_f32
280
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
281
+
282
+ #define GGML_F16_VEC GGML_F32Cx4
283
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
284
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
285
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
286
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
287
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
288
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
289
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
290
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
291
+ #endif
292
+
293
+ #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
294
+
295
+ #define GGML_SIMD
296
+
297
+ // F32 NEON
298
+
299
+ #define GGML_F32_STEP 16
300
+ #define GGML_F32_EPR 4
301
+
302
+ #define GGML_F32x4 float32x4_t
303
+ #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
304
+ #define GGML_F32x4_SET1(x) vdupq_n_f32(x)
305
+ #define GGML_F32x4_LOAD vld1q_f32
306
+ #define GGML_F32x4_STORE vst1q_f32
307
+ #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
308
+ #define GGML_F32x4_ADD vaddq_f32
309
+ #define GGML_F32x4_MUL vmulq_f32
310
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
311
// Folds the float32x4_t accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8), then horizontally sums x[0] into res.
// The elements of x are overwritten.
// Wrapped in do { } while (0), like the other do/while REDUCE macros in this
// header, so the expansion is one statement; terminate the invocation with ';'.
#define GGML_F32x4_REDUCE(res, x)                        \
do {                                                     \
    int offset = GGML_F32_ARR >> 1;                      \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);       \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);       \
    }                                                    \
    offset >>= 1;                                        \
    for (int i = 0; i < offset; ++i) {                   \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);       \
    }                                                    \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]);  \
} while (0)
327
+
328
+ #define GGML_F32_VEC GGML_F32x4
329
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
330
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
331
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
332
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
333
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
334
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
335
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
336
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
337
+
338
+ // F16 NEON
339
+
340
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
341
+ #define GGML_F16_STEP 32
342
+ #define GGML_F16_EPR 8
343
+
344
+ #define GGML_F16x8 float16x8_t
345
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
346
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
347
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
348
+ #define GGML_F16x8_STORE vst1q_f16
349
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
350
+ #define GGML_F16x8_ADD vaddq_f16
351
+ #define GGML_F16x8_MUL vmulq_f16
352
// Folds the float16x8_t accumulators in x into x[0] with three halving passes
// (offsets GGML_F16_ARR/2, /4, /8), then widens both halves of x[0] to
// float32x4_t and sums all lanes into res. The elements of x are overwritten.
#define GGML_F16x8_REDUCE(res, x)                                   \
do {                                                                \
    int offset = GGML_F16_ARR;                                      \
    for (int reduce_pass = 0; reduce_pass < 3; ++reduce_pass) {     \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
    }                                                               \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0]));     \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0]));     \
    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));             \
} while (0)
370
+
371
+ #define GGML_F16_VEC GGML_F16x8
372
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
373
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
374
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
375
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
376
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
377
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
378
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
379
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
380
+ #else
381
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
382
+ // and take advantage of the vcvt_ functions to convert to/from FP16
383
+
384
+ #define GGML_F16_STEP 16
385
+ #define GGML_F16_EPR 4
386
+
387
+ #define GGML_F32Cx4 float32x4_t
388
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
389
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
390
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
391
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
392
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
393
+ #define GGML_F32Cx4_ADD vaddq_f32
394
+ #define GGML_F32Cx4_MUL vmulq_f32
395
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
396
+
397
+ #define GGML_F16_VEC GGML_F32Cx4
398
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
399
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
400
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
401
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
402
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
403
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
404
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
405
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
406
+ #endif
407
+
408
+ #elif defined(__AVX512F__)
409
+
410
+ #define GGML_SIMD
411
+
412
+ // F32 AVX512
413
+
414
+ #define GGML_F32_STEP 64
415
+ #define GGML_F32_EPR 16
416
+
417
+ #define GGML_F32x16 __m512
418
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
419
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
420
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
421
+ #define GGML_F32x16_STORE _mm512_storeu_ps
422
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
423
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
424
+ #define GGML_F32x16_ADD _mm512_add_ps
425
+ #define GGML_F32x16_MUL _mm512_mul_ps
426
// Folds the __m512 accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8), then horizontally sums x[0] into res.
// The elements of x are overwritten. Arguments are parenthesized so that
// non-trivial expressions can be passed for res and x.
#define GGML_F32x16_REDUCE(res, x)                            \
do {                                                          \
    int offset = GGML_F32_ARR >> 1;                           \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    offset >>= 1;                                             \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    offset >>= 1;                                             \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);        \
} while (0)
442
+
443
+ // TODO: is this optimal ?
444
+
445
+ #define GGML_F32_VEC GGML_F32x16
446
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
447
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
448
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
449
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
450
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
451
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
452
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
453
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
454
+
455
+ // F16 AVX512
456
+
457
+ // F16 AVX
458
+
459
+ #define GGML_F16_STEP 64
460
+ #define GGML_F16_EPR 16
461
+
462
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
463
+
464
+ #define GGML_F32Cx16 __m512
465
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
466
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
467
+
468
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
469
+ // so F16C guard isn't required
470
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
471
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
472
+
473
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
474
+ #define GGML_F32Cx16_ADD _mm512_add_ps
475
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
476
// Reduction for the F16-as-F32 AVX512 path: folds the __m512 accumulators in x
// into x[0] with three halving passes, then horizontally sums x[0] into res.
// The pass offsets are derived from GGML_F32_ARR. The elements of x are
// overwritten. Arguments are parenthesized so that non-trivial expressions
// can be passed for res and x.
#define GGML_F32Cx16_REDUCE(res, x)                           \
do {                                                          \
    int offset = GGML_F32_ARR >> 1;                           \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    offset >>= 1;                                             \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    offset >>= 1;                                             \
    for (int i = 0; i < offset; ++i) {                        \
        (x)[i] = _mm512_add_ps((x)[i], (x)[offset+i]);        \
    }                                                         \
    (res) = (ggml_float) _mm512_reduce_add_ps((x)[0]);        \
} while (0)
492
+
493
+ #define GGML_F16_VEC GGML_F32Cx16
494
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
495
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
496
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
497
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
498
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
499
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
500
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
501
+
502
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
503
+ #elif defined(__AVX__)
504
+
505
+ #define GGML_SIMD
506
+
507
+ // F32 AVX
508
+
509
+ #define GGML_F32_STEP 32
510
+ #define GGML_F32_EPR 8
511
+
512
+ #define GGML_F32x8 __m256
513
+ #define GGML_F32x8_ZERO _mm256_setzero_ps()
514
+ #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
515
+ #define GGML_F32x8_LOAD _mm256_loadu_ps
516
+ #define GGML_F32x8_STORE _mm256_storeu_ps
517
+ #if defined(__FMA__)
518
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
519
+ #else
520
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
521
+ #endif
522
+ #define GGML_F32x8_ADD _mm256_add_ps
523
+ #define GGML_F32x8_MUL _mm256_mul_ps
524
// Folds the __m256 accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8). The low and high 128-bit halves of x[0]
// are then added and two horizontal adds collapse the four lanes into res.
// The elements of x are overwritten. Arguments are parenthesized so that
// non-trivial expressions can be passed for res and x.
#define GGML_F32x8_REDUCE(res, x)                                     \
do {                                                                  \
    int offset = GGML_F32_ARR >> 1;                                   \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        (x)[i] = _mm256_add_ps((x)[i], (x)[offset+i]);                \
    }                                                                 \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128((x)[0]),      \
                                 _mm256_extractf128_ps((x)[0], 1));   \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                            \
    (res) = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));          \
} while (0)
543
+ // TODO: is this optimal ?
544
+
545
+ #define GGML_F32_VEC GGML_F32x8
546
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
547
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
548
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
549
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
550
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
551
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
552
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
553
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
554
+
555
+ // F16 AVX
556
+
557
+ #define GGML_F16_STEP 32
558
+ #define GGML_F16_EPR 8
559
+
560
+ // F16 arithmetic is not supported by AVX, so we use F32 instead
561
+
562
+ #define GGML_F32Cx8 __m256
563
+ #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
564
+ #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
565
+
566
+ #if defined(__F16C__)
567
+ // the _mm256_cvt intrinsics require F16C
568
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
569
+ #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
570
+ #else
571
+ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
572
+ float tmp[8];
573
+
574
+ for (int i = 0; i < 8; i++) {
575
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
576
+ }
577
+
578
+ return _mm256_loadu_ps(tmp);
579
+ }
580
+ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
581
+ float arr[8];
582
+
583
+ _mm256_storeu_ps(arr, y);
584
+
585
+ for (int i = 0; i < 8; i++)
586
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
587
+ }
588
+ #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
589
+ #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
590
+ #endif
591
+
592
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
593
+ #define GGML_F32Cx8_ADD _mm256_add_ps
594
+ #define GGML_F32Cx8_MUL _mm256_mul_ps
595
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
596
+
597
+ #define GGML_F16_VEC GGML_F32Cx8
598
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
599
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
600
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
601
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
602
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
603
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
604
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
605
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
606
+
607
+ #elif defined(__POWER9_VECTOR__)
608
+
609
+ #define GGML_SIMD
610
+
611
+ // F32 POWER9
612
+
613
+ #define GGML_F32_STEP 32
614
+ #define GGML_F32_EPR 4
615
+
616
+ #define GGML_F32x4 vector float
617
+ #define GGML_F32x4_ZERO {0.0f}
618
+ #define GGML_F32x4_SET1 vec_splats
619
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
620
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
621
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
622
+ #define GGML_F32x4_ADD vec_add
623
+ #define GGML_F32x4_MUL vec_mul
624
// Folds the vector-float accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8), then adds the four lanes of x[0] into res.
// The elements of x are overwritten.
// Wrapped in do { } while (0) so the expansion is one statement (terminate the
// invocation with ';'); arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                   \
do {                                                \
    int offset = GGML_F32_ARR >> 1;                 \
    for (int i = 0; i < offset; ++i) {              \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);    \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);    \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        (x)[i] = vec_add((x)[i], (x)[offset+i]);    \
    }                                               \
    (res) = vec_extract((x)[0], 0) +                \
            vec_extract((x)[0], 1) +                \
            vec_extract((x)[0], 2) +                \
            vec_extract((x)[0], 3);                 \
} while (0)
643
+
644
+ #define GGML_F32_VEC GGML_F32x4
645
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
646
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
647
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
648
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
649
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
650
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
651
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
652
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
653
+
654
+ // F16 POWER9
655
+ #define GGML_F16_STEP GGML_F32_STEP
656
+ #define GGML_F16_EPR GGML_F32_EPR
657
+ #define GGML_F16_VEC GGML_F32x4
658
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
659
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
660
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
661
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
662
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
663
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
664
+ // Use vec_xl, not vec_ld, in case the load address is not aligned.
665
// Loads four 16-bit floats as a vector of floats.
// For odd i the vector is loaded GGML_F16_EPR elements before p and converted
// with vec_extract_fp32_from_shorth; for even i it is loaded at p and
// converted with vec_extract_fp32_from_shortl.
// The whole expansion and each argument are parenthesized so the macro can be
// used inside larger expressions and with non-trivial arguments.
#define GGML_F16_VEC_LOAD(p, i) (((i) & 0x1) ?                      \
    vec_extract_fp32_from_shorth(vec_xl(0, (p) - GGML_F16_EPR)) :   \
    vec_extract_fp32_from_shortl(vec_xl(0, (p))))
668
// Returns byte i (0 or 1) of a uint16_t holding the value 1, as laid out in memory.
// Byte 0 is 1 on a little-endian host and byte 1 is 1 on a big-endian host.
static inline unsigned char ggml_endian_byte(int i) {
    const uint16_t probe = 1;
    const unsigned char * bytes = (const unsigned char *) &probe;
    return bytes[i];
}
672
+ #define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
673
// Stores two float vectors as eight 16-bit floats, but only when i is odd:
// r[i - 1] and r[i] are packed with vec_pack_to_short_fp32 (their order is
// chosen through GGML_ENDIAN_BYTE) and written GGML_F16_EPR elements before p.
// For even i the macro does nothing.
// Wrapped in do { } while (0) so the inner 'if' cannot capture an 'else' that
// follows the invocation; arguments are parenthesized.
#define GGML_F16_VEC_STORE(p, r, i)                                         \
do {                                                                        \
    if ((i) & 0x1) {                                                        \
        vec_xst(vec_pack_to_short_fp32((r)[(i) - GGML_ENDIAN_BYTE(1)],      \
                                       (r)[(i) - GGML_ENDIAN_BYTE(0)]),     \
                0, (p) - GGML_F16_EPR);                                     \
    }                                                                       \
} while (0)
678
+
679
+ #elif defined(__wasm_simd128__)
680
+
681
+ #define GGML_SIMD
682
+
683
+ // F32 WASM
684
+
685
+ #define GGML_F32_STEP 16
686
+ #define GGML_F32_EPR 4
687
+
688
+ #define GGML_F32x4 v128_t
689
+ #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
690
+ #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
691
+ #define GGML_F32x4_LOAD wasm_v128_load
692
+ #define GGML_F32x4_STORE wasm_v128_store
693
+ #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
694
+ #define GGML_F32x4_ADD wasm_f32x4_add
695
+ #define GGML_F32x4_MUL wasm_f32x4_mul
696
// Folds the v128_t accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8), then adds the four lanes of x[0] into res.
// The elements of x are overwritten.
// Wrapped in do { } while (0) so the expansion is one statement (terminate the
// invocation with ';'); arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                          \
do {                                                       \
    int offset = GGML_F32_ARR >> 1;                        \
    for (int i = 0; i < offset; ++i) {                     \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);    \
    }                                                      \
    offset >>= 1;                                          \
    for (int i = 0; i < offset; ++i) {                     \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);    \
    }                                                      \
    offset >>= 1;                                          \
    for (int i = 0; i < offset; ++i) {                     \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);    \
    }                                                      \
    (res) = wasm_f32x4_extract_lane((x)[0], 0) +           \
            wasm_f32x4_extract_lane((x)[0], 1) +           \
            wasm_f32x4_extract_lane((x)[0], 2) +           \
            wasm_f32x4_extract_lane((x)[0], 3);            \
} while (0)
715
+
716
+ #define GGML_F32_VEC GGML_F32x4
717
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
718
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
719
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
720
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
721
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
722
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
723
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
724
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
725
+
726
+ // F16 WASM
727
+
728
+ #define GGML_F16_STEP 16
729
+ #define GGML_F16_EPR 4
730
+
731
+ inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
732
+ float tmp[4];
733
+
734
+ tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
735
+ tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
736
+ tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
737
+ tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
738
+
739
+ return wasm_v128_load(tmp);
740
+ }
741
+
742
+ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
743
+ float tmp[4];
744
+
745
+ wasm_v128_store(tmp, x);
746
+
747
+ p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
748
+ p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
749
+ p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
750
+ p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
751
+ }
752
+
753
+ #define GGML_F16x4 v128_t
754
+ #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
755
+ #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
756
+ #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
757
+ #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
758
+ #define GGML_F16x4_FMA GGML_F32x4_FMA
759
+ #define GGML_F16x4_ADD wasm_f32x4_add
760
+ #define GGML_F16x4_MUL wasm_f32x4_mul
761
// Folds the v128_t accumulators in x into x[0] with three halving passes
// (offsets GGML_F16_ARR/2, /4, /8), then adds the four float lanes of x[0]
// and stores the sum, cast to ggml_float, in res. The elements of x are overwritten.
// Wrapped in do { } while (0) so the expansion is one statement (terminate the
// invocation with ';'); arguments are parenthesized.
#define GGML_F16x4_REDUCE(res, x)                                      \
do {                                                                   \
    int offset = GGML_F16_ARR >> 1;                                    \
    for (int i = 0; i < offset; ++i) {                                 \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);                \
    }                                                                  \
    offset >>= 1;                                                      \
    for (int i = 0; i < offset; ++i) {                                 \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);                \
    }                                                                  \
    offset >>= 1;                                                      \
    for (int i = 0; i < offset; ++i) {                                 \
        (x)[i] = wasm_f32x4_add((x)[i], (x)[offset+i]);                \
    }                                                                  \
    (res) = (ggml_float) (wasm_f32x4_extract_lane((x)[0], 0) +         \
                          wasm_f32x4_extract_lane((x)[0], 1) +         \
                          wasm_f32x4_extract_lane((x)[0], 2) +         \
                          wasm_f32x4_extract_lane((x)[0], 3));         \
} while (0)
780
+
781
+ #define GGML_F16_VEC GGML_F16x4
782
+ #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
783
+ #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
784
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
785
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
786
+ #define GGML_F16_VEC_FMA GGML_F16x4_FMA
787
+ #define GGML_F16_VEC_ADD GGML_F16x4_ADD
788
+ #define GGML_F16_VEC_MUL GGML_F16x4_MUL
789
+ #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
790
+
791
+ #elif defined(__SSE3__)
792
+
793
+ #define GGML_SIMD
794
+
795
+ // F32 SSE
796
+
797
+ #define GGML_F32_STEP 32
798
+ #define GGML_F32_EPR 4
799
+
800
+ #define GGML_F32x4 __m128
801
+ #define GGML_F32x4_ZERO _mm_setzero_ps()
802
+ #define GGML_F32x4_SET1(x) _mm_set1_ps(x)
803
+ #define GGML_F32x4_LOAD _mm_loadu_ps
804
+ #define GGML_F32x4_STORE _mm_storeu_ps
805
+ #if defined(__FMA__)
806
+ // TODO: Does this work?
807
+ #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
808
+ #else
809
+ #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
810
+ #endif
811
+ #define GGML_F32x4_ADD _mm_add_ps
812
+ #define GGML_F32x4_MUL _mm_mul_ps
813
// Folds the __m128 accumulators in x into x[0] with three halving passes
// (offsets GGML_F32_ARR/2, /4, /8), then two horizontal adds collapse the
// four lanes of x[0] into res. The elements of x are overwritten.
// Wrapped in do { } while (0) so the expansion is one statement (terminate the
// invocation with ';'); arguments are parenthesized.
#define GGML_F32x4_REDUCE(res, x)                                  \
do {                                                               \
    int offset = GGML_F32_ARR >> 1;                                \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    offset >>= 1;                                                  \
    for (int i = 0; i < offset; ++i) {                             \
        (x)[i] = _mm_add_ps((x)[i], (x)[offset+i]);                \
    }                                                              \
    const __m128 t0 = _mm_hadd_ps((x)[0], (x)[0]);                 \
    (res) = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));       \
} while (0)
830
+ // TODO: is this optimal ?
831
+
832
+ #define GGML_F32_VEC GGML_F32x4
833
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
834
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
835
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
836
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
837
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
838
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
839
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
840
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
841
+
842
+ // F16 SSE
843
+
844
+ #define GGML_F16_STEP 32
845
+ #define GGML_F16_EPR 4
846
+
847
+ static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
848
+ float tmp[4];
849
+
850
+ tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
851
+ tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
852
+ tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
853
+ tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
854
+
855
+ return _mm_loadu_ps(tmp);
856
+ }
857
+
858
+ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
859
+ float arr[4];
860
+
861
+ _mm_storeu_ps(arr, y);
862
+
863
+ x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
864
+ x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
865
+ x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
866
+ x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
867
+ }
868
+
869
+ #define GGML_F32Cx4 __m128
870
+ #define GGML_F32Cx4_ZERO _mm_setzero_ps()
871
+ #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
872
+ #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
873
+ #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
874
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
875
+ #define GGML_F32Cx4_ADD _mm_add_ps
876
+ #define GGML_F32Cx4_MUL _mm_mul_ps
877
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
878
+
879
+ #define GGML_F16_VEC GGML_F32Cx4
880
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
881
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
882
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
883
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
884
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
885
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
886
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
887
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
888
+
889
+ #elif defined(__loongarch_asx)
890
+
891
+ #define GGML_SIMD
892
+
893
+ // F32 LASX
894
+ #define GGML_F32_STEP 32
895
+ #define GGML_F32_EPR 8
896
+
897
+ #define GGML_F32x8 __m256
898
+ #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
899
+ #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
900
+ #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
901
+ #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
902
+ #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
903
+ #define GGML_F32x8_ADD __lasx_xvfadd_s
904
+ #define GGML_F32x8_MUL __lasx_xvfmul_s
905
+ #define GGML_F32x8_REDUCE(res, x) \
906
+ do { /* sum every float held in the accumulator array x[] into res; x[] is overwritten */ \
907
+ int offset = GGML_F32_ARR >> 1; /* each pass adds the upper half of the live range onto the lower half */ \
908
+ for (int i = 0; i < offset; ++i) { \
909
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
910
+ } \
911
+ offset >>= 1; \
912
+ for (int i = 0; i < offset; ++i) { \
913
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
914
+ } \
915
+ offset >>= 1; /* with this branch's GGML_F32_STEP 32 / GGML_F32_EPR 8 the offset is 0 here, so the last pass runs no iterations */ \
916
+ for (int i = 0; i < offset; ++i) { \
917
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
918
+ } \
919
+ float *tmp_p = (float *)&x[0]; /* view the eight lanes of x[0] as plain floats */ \
920
+ res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
921
+ } while (0)
922
+ // TODO: is this optimal ?
923
+
924
+ #define GGML_F32_VEC GGML_F32x8
925
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
926
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
927
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
928
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
929
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
930
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
931
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
932
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
933
+
934
+ // F16 LASX
935
+
936
+ #define GGML_F16_STEP 32
937
+ #define GGML_F16_EPR 8
938
+
939
+ // F16 arithmetic is not supported by LASX, so we use F32 instead
940
+
941
+ #define GGML_F32Cx8 __m256
942
+ #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
943
+ #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) // float broadcast via the same helper as GGML_F32x8_SET1; __lasx_xvreplgr2vr_w takes an int, so a float argument would be value-converted before being replicated
944
+
945
+ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { // read 8 fp16 values starting at x and return them as one __m256 of floats
946
+ __m256i a;
947
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8); // copies only 8 fp16 values; the remaining bytes of a are left unwritten
948
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4)); // NOTE(review): presumably places 64-bit element 0 in the low 128-bit half and element 1 in the high half - confirm against the LASX manual
949
+ return __lasx_xvfcvtl_s_h(a); // NOTE(review): presumably widens the low four fp16 of each 128-bit half to fp32 - confirm
950
+ }
951
+
952
+ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { // narrow the floats in y to fp16 and write 8 values to x
953
+ __m256i a = __lasx_xvfcvt_h_s(y, y); // y is passed as both operands; NOTE(review): presumably each 128-bit half then holds its four converted values twice - confirm
954
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2)); // NOTE(review): presumably gathers 64-bit elements 0 and 2 into the low 128 bits - confirm against the LASX manual
955
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8); // only the first 8 fp16 values of a are written out
956
+ }
957
+ #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
958
+ #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
959
+
960
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
961
+ #define GGML_F32Cx8_ADD __lasx_xvfadd_s
962
+ #define GGML_F32Cx8_MUL __lasx_xvfmul_s
963
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
964
+
965
+ #define GGML_F16_VEC GGML_F32Cx8
966
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
967
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
968
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
969
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
970
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
971
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
972
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
973
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
974
+
975
+ #elif defined(__loongarch_sx)
976
+
977
+ #define GGML_SIMD
978
+
979
+ // F32 LSX
980
+
981
+ #define GGML_F32_STEP 32
982
+ #define GGML_F32_EPR 4
983
+
984
+ #define GGML_F32x4 __m128
985
+ #define GGML_F32x4_ZERO __lsx_vldi(0)
986
+ #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
987
+ #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
988
+ #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
989
+ #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
990
+ #define GGML_F32x4_ADD __lsx_vfadd_s
991
+ #define GGML_F32x4_MUL __lsx_vfmul_s
992
+ #define GGML_F32x4_REDUCE(res, x) \
993
+ { /* sum every float held in the accumulator array x[] into res; x[] is overwritten */ \
994
+ int offset = GGML_F32_ARR >> 1; \
995
+ for (int i = 0; i < offset; ++i) { \
996
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
997
+ } \
998
+ offset >>= 1; \
999
+ for (int i = 0; i < offset; ++i) { \
1000
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1001
+ } \
1002
+ offset >>= 1; \
1003
+ for (int i = 0; i < offset; ++i) { \
1004
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1005
+ } \
1006
+ /* x[0] now holds the four partial sums; add its lanes as floats. */ \
1007
+ /* Lanes are read through a float pointer, like the LASX reduction, */ \
1008
+ /* rather than with __lsx_vpickve2gr_w: that intrinsic yields an int, */ \
1009
+ /* so casting its result to ggml_float would convert the raw bit */ \
1010
+ /* pattern numerically instead of reading the lane as a float. */ \
1011
+ const float * ggml_lanes_ = (const float *) &x[0]; \
1012
+ res = (ggml_float) ((ggml_lanes_[0] + ggml_lanes_[1]) + \
1013
+ (ggml_lanes_[2] + ggml_lanes_[3])); \
1014
+ }
1015
+
1016
+ #define GGML_F32_VEC GGML_F32x4
1017
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1018
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1019
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1020
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
1021
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
1022
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
1023
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
1024
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1025
+
1026
+ // F16 LSX
1027
+
1028
+ #define GGML_F16_STEP 32
1029
+ #define GGML_F16_EPR 4
1030
+
1031
+ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
1032
+ // Widen x[0..3] from fp16 to fp32 one element at a time into a scratch
1033
+ // array, then load that array into a single LSX register.
1034
+ float widened[4];
1035
+ for (int lane = 0; lane < 4; ++lane) {
1036
+ widened[lane] = GGML_CPU_FP16_TO_FP32(x[lane]);
1037
+ }
1038
+
1039
+ return __lsx_vld(widened, 0);
1040
+ }
1041
+
1042
+ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
1043
+ // Spill the four fp32 lanes of y to a scratch array, then narrow each
1044
+ // lane to fp16 and write it to x[0..3].
1045
+ float staged[4];
1046
+ __lsx_vst(y, staged, 0);
1047
+
1048
+ for (int lane = 0; lane < 4; ++lane) {
1049
+ x[lane] = GGML_CPU_FP32_TO_FP16(staged[lane]);
1050
+ }
1051
+ }
1052
+
1053
+ #define GGML_F32Cx4 __m128
1054
+ #define GGML_F32Cx4_ZERO __lsx_vldi(0)
1055
+ #define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
1056
+ #define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
1057
+ #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
1058
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
1059
+ #define GGML_F32Cx4_ADD __lsx_vfadd_s
1060
+ #define GGML_F32Cx4_MUL __lsx_vfmul_s
1061
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
1062
+
1063
+ #define GGML_F16_VEC GGML_F32Cx4
1064
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
1065
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
1066
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
1067
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
1068
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
1069
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
1070
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
1071
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
1072
+
1073
+ #elif defined(__VXE__) || defined(__VXE2__)
1074
+
1075
+ #define GGML_SIMD
1076
+
1077
+ // F32 s390x
1078
+
1079
+ #define GGML_F32_STEP 32
1080
+ #define GGML_F32_EPR 4
1081
+
1082
+ #define GGML_F32x4 float32x4_t
1083
+ #define GGML_F32x4_ZERO vec_splats(0.0f)
1084
+ #define GGML_F32x4_SET1 vec_splats
1085
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
1086
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
1087
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
1088
+ #define GGML_F32x4_ADD vec_add
1089
+ #define GGML_F32x4_MUL vec_mul
1090
+ #define GGML_F32x4_REDUCE(res, x) \
1091
+ { /* sum every float held in the accumulator array x[] into res; x[] is overwritten */ \
1092
+ int offset = GGML_F32_ARR >> 1; /* each pass adds the upper half of the live range onto the lower half */ \
1093
+ for (int i = 0; i < offset; ++i) { \
1094
+ x[i] = vec_add(x[i], x[offset + i]); \
1095
+ } \
1096
+ offset >>= 1; \
1097
+ for (int i = 0; i < offset; ++i) { \
1098
+ x[i] = vec_add(x[i], x[offset + i]); \
1099
+ } \
1100
+ offset >>= 1; /* with this branch's GGML_F32_STEP 32 / GGML_F32_EPR 4 the offsets are 4, 2, 1, leaving the total in x[0] */ \
1101
+ for (int i = 0; i < offset; ++i) { \
1102
+ x[i] = vec_add(x[i], x[offset + i]); \
1103
+ } \
1104
+ float32x4_t tmp = x[0] + vec_reve(x[0]); /* NOTE(review): assumes vec_reve reverses lane order, so tmp[0] + tmp[1] covers all four lanes - confirm */ \
1105
+ res = tmp[0] + tmp[1]; \
1106
+ }
1107
+
1108
+ #define GGML_F32_VEC GGML_F32x4
1109
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1110
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1111
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1112
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
1113
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
1114
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
1115
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
1116
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1117
+
1118
+ // F16 s390x
1119
+ #define GGML_F16_STEP GGML_F32_STEP
1120
+ #define GGML_F16_EPR GGML_F32_EPR
1121
+
1122
+ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { // load fp16 values starting at x and return four fp32 lanes
1123
+ #if defined(__NNPA__) // vector conversion path, selected when __NNPA__ is defined
1124
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); // NOTE(review): uint16x8_t suggests 8 halfwords are read while only 4 results are returned - confirm reading past x[3] is safe for all callers
1125
+ uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
1126
+ return vec_extend_to_fp32_hi(v_xd, 0);
1127
+ #else // scalar fallback: convert element by element, then load the scratch array
1128
+ float tmp[4];
1129
+
1130
+ for (int i = 0; i < 4; i++) {
1131
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
1132
+ }
1133
+
1134
+ // note: keep type-cast here to prevent compiler bugs
1135
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
1136
+ return vec_xl(0, (const float *)(tmp));
1137
+ #endif
1138
+ }
1139
+
1140
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { // narrow the four fp32 lanes of v_y to fp16 and write them to x[0..3]
1141
+ #if defined(__NNPA__) // vector conversion path, selected when __NNPA__ is defined
1142
+ float32x4_t v_zero = vec_splats(0.0f); // zero vector supplied as the second input of the rounding step
1143
+ uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
1144
+ uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
1145
+ // only elements 0..3 of the converted vector are written out
1146
+ x[0] = vec_extract(v_x, 0);
1147
+ x[1] = vec_extract(v_x, 1);
1148
+ x[2] = vec_extract(v_x, 2);
1149
+ x[3] = vec_extract(v_x, 3);
1150
+ #else // scalar fallback: spill the lanes to a scratch array, then convert element by element
1151
+ float arr[4];
1152
+
1153
+ // note: keep type-cast here to prevent compiler bugs
1154
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
1155
+ vec_xst(v_y, 0, (float *)(arr));
1156
+
1157
+ for (int i = 0; i < 4; i++) {
1158
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
1159
+ }
1160
+ #endif
1161
+ }
1162
+
1163
+ #define GGML_F16_VEC GGML_F32x4
1164
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
1165
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
1166
+ #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
1167
+ #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
1168
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
1169
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
1170
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
1171
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
1172
+
1173
+ #endif
1174
+
1175
+ // GGML_F32_ARR / GGML_F16_ARR
1176
+ // number of registers to use per step
1177
+ #ifdef GGML_SIMD
1178
+ #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
1179
+ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
1180
+ #endif
1181
+
1182
+ #ifdef __cplusplus
1183
+ }
1184
+ #endif