whispercpp 1.3.1 → 1.3.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (857)
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -3
  3. data/README.md +161 -43
  4. data/Rakefile +45 -13
  5. data/ext/.gitignore +4 -8
  6. data/ext/dependencies.rb +73 -0
  7. data/ext/extconf.rb +21 -198
  8. data/ext/options.rb +85 -0
  9. data/ext/ruby_whisper.c +177 -0
  10. data/ext/ruby_whisper.h +17 -2
  11. data/ext/ruby_whisper_context.c +672 -0
  12. data/ext/ruby_whisper_error.c +52 -0
  13. data/ext/ruby_whisper_model.c +232 -0
  14. data/ext/ruby_whisper_params.c +1303 -0
  15. data/ext/ruby_whisper_segment.c +220 -0
  16. data/ext/ruby_whisper_transcribe.cpp +93 -0
  17. data/ext/ruby_whisper_vad_params.c +288 -0
  18. data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
  19. data/ext/sources/CMakeLists.txt +255 -0
  20. data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
  21. data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
  22. data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
  23. data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
  24. data/ext/sources/bindings/javascript/package.json +26 -0
  25. data/ext/sources/bindings/javascript/whisper.js +19 -0
  26. data/ext/sources/build-xcframework.sh +547 -0
  27. data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
  28. data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
  29. data/ext/sources/cmake/build-info.cmake +60 -0
  30. data/ext/sources/cmake/git-vars.cmake +22 -0
  31. data/ext/sources/cmake/whisper-config.cmake.in +65 -0
  32. data/ext/sources/cmake/whisper.pc.in +10 -0
  33. data/ext/sources/examples/CMakeLists.txt +124 -0
  34. data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
  35. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +133 -0
  36. data/ext/sources/examples/addon.node/addon.cpp +557 -0
  37. data/ext/sources/examples/addon.node/index.js +57 -0
  38. data/ext/sources/examples/addon.node/package.json +16 -0
  39. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  40. data/ext/sources/examples/bench/CMakeLists.txt +8 -0
  41. data/ext/sources/examples/bench/bench.cpp +176 -0
  42. data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
  43. data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
  44. data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
  45. data/ext/sources/examples/cli/CMakeLists.txt +8 -0
  46. data/ext/sources/examples/cli/cli.cpp +1295 -0
  47. data/ext/sources/examples/coi-serviceworker.js +146 -0
  48. data/ext/sources/examples/command/CMakeLists.txt +10 -0
  49. data/ext/sources/examples/command/command.cpp +800 -0
  50. data/ext/sources/examples/command/commands.txt +9 -0
  51. data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
  52. data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
  53. data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
  54. data/ext/sources/examples/common-ggml.cpp +238 -0
  55. data/ext/sources/examples/common-ggml.h +18 -0
  56. data/ext/sources/examples/common-sdl.cpp +227 -0
  57. data/ext/sources/examples/common-sdl.h +49 -0
  58. data/ext/sources/examples/common-whisper.cpp +175 -0
  59. data/ext/sources/examples/common-whisper.h +24 -0
  60. data/ext/sources/examples/common.cpp +675 -0
  61. data/ext/sources/examples/common.h +322 -0
  62. data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
  63. data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
  64. data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
  65. data/ext/sources/examples/generate-karaoke.sh +57 -0
  66. data/ext/sources/examples/grammar-parser.cpp +423 -0
  67. data/ext/sources/examples/grammar-parser.h +29 -0
  68. data/ext/sources/examples/helpers.js +191 -0
  69. data/ext/sources/examples/json.hpp +24596 -0
  70. data/ext/sources/examples/livestream.sh +112 -0
  71. data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
  72. data/ext/sources/examples/lsp/lsp.cpp +469 -0
  73. data/ext/sources/examples/lsp/whisper.vim +362 -0
  74. data/ext/sources/examples/miniaudio.h +93468 -0
  75. data/ext/sources/examples/python/test_whisper_processor.py +7 -0
  76. data/ext/sources/examples/python/whisper_processor.py +54 -0
  77. data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
  78. data/ext/sources/examples/quantize/quantize.cpp +226 -0
  79. data/ext/sources/examples/server/CMakeLists.txt +15 -0
  80. data/ext/sources/examples/server/bench.js +29 -0
  81. data/ext/sources/examples/server/httplib.h +10497 -0
  82. data/ext/sources/examples/server/server.cpp +1238 -0
  83. data/ext/sources/examples/server.py +115 -0
  84. data/ext/sources/examples/stb_vorbis.c +5584 -0
  85. data/ext/sources/examples/stream/CMakeLists.txt +10 -0
  86. data/ext/sources/examples/stream/stream.cpp +435 -0
  87. data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
  88. data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
  89. data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
  90. data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
  91. data/ext/sources/examples/sycl/build.sh +22 -0
  92. data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
  93. data/ext/sources/examples/sycl/run-whisper.sh +17 -0
  94. data/ext/sources/examples/talk-llama/CMakeLists.txt +43 -0
  95. data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
  96. data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
  97. data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
  98. data/ext/sources/examples/talk-llama/llama-arch.cpp +1914 -0
  99. data/ext/sources/examples/talk-llama/llama-arch.h +464 -0
  100. data/ext/sources/examples/talk-llama/llama-batch.cpp +843 -0
  101. data/ext/sources/examples/talk-llama/llama-batch.h +147 -0
  102. data/ext/sources/examples/talk-llama/llama-chat.cpp +685 -0
  103. data/ext/sources/examples/talk-llama/llama-chat.h +59 -0
  104. data/ext/sources/examples/talk-llama/llama-context.cpp +2845 -0
  105. data/ext/sources/examples/talk-llama/llama-context.h +297 -0
  106. data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
  107. data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
  108. data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
  109. data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
  110. data/ext/sources/examples/talk-llama/llama-graph.cpp +1693 -0
  111. data/ext/sources/examples/talk-llama/llama-graph.h +710 -0
  112. data/ext/sources/examples/talk-llama/llama-hparams.cpp +103 -0
  113. data/ext/sources/examples/talk-llama/llama-hparams.h +207 -0
  114. data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
  115. data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
  116. data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
  117. data/ext/sources/examples/talk-llama/llama-io.h +35 -0
  118. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  119. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  120. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  121. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  122. data/ext/sources/examples/talk-llama/llama-kv-cache.h +44 -0
  123. data/ext/sources/examples/talk-llama/llama-kv-cells.h +439 -0
  124. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  125. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  126. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  127. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  128. data/ext/sources/examples/talk-llama/llama-memory.cpp +59 -0
  129. data/ext/sources/examples/talk-llama/llama-memory.h +116 -0
  130. data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
  131. data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
  132. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1163 -0
  133. data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
  134. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +282 -0
  135. data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
  136. data/ext/sources/examples/talk-llama/llama-model.cpp +15114 -0
  137. data/ext/sources/examples/talk-llama/llama-model.h +452 -0
  138. data/ext/sources/examples/talk-llama/llama-quant.cpp +1049 -0
  139. data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
  140. data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
  141. data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
  142. data/ext/sources/examples/talk-llama/llama-vocab.cpp +3377 -0
  143. data/ext/sources/examples/talk-llama/llama-vocab.h +132 -0
  144. data/ext/sources/examples/talk-llama/llama.cpp +358 -0
  145. data/ext/sources/examples/talk-llama/llama.h +1484 -0
  146. data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
  147. data/ext/sources/examples/talk-llama/speak +40 -0
  148. data/ext/sources/examples/talk-llama/speak.bat +1 -0
  149. data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
  150. data/ext/sources/examples/talk-llama/talk-llama.cpp +810 -0
  151. data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
  152. data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
  153. data/ext/sources/examples/talk-llama/unicode.cpp +854 -0
  154. data/ext/sources/examples/talk-llama/unicode.h +66 -0
  155. data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
  156. data/ext/sources/examples/vad-speech-segments/speech.cpp +149 -0
  157. data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
  158. data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
  159. data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
  160. data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
  161. data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
  162. data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
  163. data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
  164. data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
  165. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +251 -0
  166. data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
  167. data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
  168. data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
  169. data/ext/sources/ggml/CMakeLists.txt +435 -0
  170. data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
  171. data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
  172. data/ext/sources/ggml/cmake/common.cmake +50 -0
  173. data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
  174. data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
  175. data/ext/{ggml → sources/ggml}/include/ggml-backend.h +10 -8
  176. data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
  177. data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +11 -1
  178. data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
  179. data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
  180. data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
  181. data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
  182. data/ext/{ggml → sources/ggml}/include/ggml.h +325 -269
  183. data/ext/sources/ggml/include/gguf.h +202 -0
  184. data/ext/sources/ggml/src/CMakeLists.txt +404 -0
  185. data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
  186. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  187. data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
  188. data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +92 -53
  189. data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +69 -34
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  191. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +75 -0
  192. data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
  193. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
  194. data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
  195. data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
  196. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
  197. data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +140 -1
  198. data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +588 -146
  199. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
  200. data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
  201. data/ext/{ggml → sources/ggml}/src/ggml-common.h +16 -8
  202. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +597 -0
  203. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +3 -2
  204. data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +11 -10
  205. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  208. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  209. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  210. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  211. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  212. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  213. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  214. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  215. data/ext/{ggml/src/ggml-cpu/cpu-feats-x86.cpp → sources/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp} +5 -1
  216. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  217. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +3285 -0
  218. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  219. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  220. data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
  221. data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  222. data/ext/sources/ggml/src/ggml-cpu/common.h +73 -0
  223. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +172 -41
  224. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3551 -0
  225. data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +78 -25
  226. data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.cpp → sources/ggml/src/ggml-cpu/hbm.cpp} +1 -1
  227. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
  228. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
  229. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
  230. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  231. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3594 -0
  232. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +19 -0
  233. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +9786 -0
  234. data/ext/sources/ggml/src/ggml-cpu/ops.h +118 -0
  235. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  236. data/ext/{ggml/src/ggml-cpu/ggml-cpu-quants.h → sources/ggml/src/ggml-cpu/quants.h} +26 -0
  237. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  238. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  239. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +1184 -0
  240. data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.cpp → sources/ggml/src/ggml-cpu/traits.cpp} +1 -1
  241. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  242. data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
  243. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +345 -0
  244. data/ext/sources/ggml/src/ggml-cpu/vec.h +1027 -0
  245. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
  246. data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
  247. data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
  248. data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
  249. data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
  250. data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
  251. data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
  252. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
  253. data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
  254. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
  255. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
  256. data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
  257. data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/common.cuh +851 -0
  259. data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
  260. data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
  262. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  264. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  266. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  267. data/ext/sources/ggml/src/ggml-cuda/convert.cu +752 -0
  268. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +31 -0
  269. data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
  270. data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  271. data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  272. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
  273. data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
  274. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
  275. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  276. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
  277. data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
  278. data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  279. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
  280. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1474 -0
  281. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
  282. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
  283. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
  284. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
  285. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
  286. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
  287. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +638 -0
  288. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
  289. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
  290. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
  291. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
  292. data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
  293. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3647 -0
  294. data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
  295. data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
  296. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
  297. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
  298. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  299. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  300. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
  301. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
  302. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
  303. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +506 -0
  304. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +11 -0
  305. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
  306. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
  307. data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
  308. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
  309. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  310. data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  311. data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
  312. data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  313. data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
  314. data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
  315. data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
  316. data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  317. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
  318. data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
  319. data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
  320. data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
  321. data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
  322. data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
  323. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
  324. data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
  325. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
  326. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  327. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +155 -0
  328. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  329. data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
  330. data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
  331. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +26 -0
  332. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  333. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
  334. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
  335. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  336. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  337. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
  338. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
  339. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
  340. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
  341. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  342. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  343. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
  344. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  345. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
  346. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
  347. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  348. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  349. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  350. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
  351. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
  352. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
  353. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
  354. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
  355. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
  356. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
  357. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
  358. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
  359. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
  360. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
  361. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
  362. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
  363. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
  364. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
  365. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
  366. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
  367. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
  368. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
  369. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
  370. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
  371. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
  372. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
  373. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
  374. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
  375. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
  376. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
  377. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
  378. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
  379. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
  380. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
  381. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
  382. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
  383. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
  384. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
  385. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
  386. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
  387. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
  388. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
  389. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
  390. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
  391. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
  392. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
  393. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
  394. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
  395. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
  396. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
  397. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
  398. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
  399. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
  400. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
  401. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
  402. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
  403. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
  404. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
  405. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
  406. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
  407. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
  408. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
  409. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
  410. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
  411. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
  412. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
  413. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
  414. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
  415. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
  416. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
  417. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
  418. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
  419. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
  420. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
  421. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
  422. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
  423. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
  424. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
  425. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
  426. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
  427. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
  428. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
  429. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
  430. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
  431. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
  432. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
  433. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
  434. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
  435. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
  436. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
  437. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
  438. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
  439. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  440. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  441. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  442. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  443. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  444. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  445. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  446. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  447. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  448. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  449. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  450. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  451. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  452. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  453. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  454. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  455. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  456. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  457. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
  458. data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  459. data/ext/sources/ggml/src/ggml-cuda/unary.cu +378 -0
  460. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +66 -0
  461. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
  462. data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
  463. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
  464. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
  465. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
  466. data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
  467. data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
  468. data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
  469. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +135 -0
  470. data/ext/{ggml → sources/ggml}/src/ggml-impl.h +147 -158
  471. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  472. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
  473. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
  474. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
  475. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
  476. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
  477. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
  478. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
  479. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
  480. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
  481. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
  482. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
  483. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
  484. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
  485. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
  486. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
  487. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
  488. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
  489. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
  490. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
  491. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
  492. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
  493. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
  494. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
  495. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
  496. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
  497. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
  498. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
  499. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
  500. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
  501. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
  502. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
  503. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
  504. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
  505. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
  506. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
  507. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
  508. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
  509. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +121 -0
  510. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +649 -0
  511. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2504 -1108
  512. data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +2102 -1463
  513. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
  514. data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
  515. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
  516. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +110 -0
  517. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +6494 -0
  518. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
  519. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  520. data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  521. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  522. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  523. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
  524. data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  525. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  526. data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  527. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
  528. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  529. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  530. data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
  531. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  532. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  533. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  534. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  535. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
  536. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  537. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  538. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  539. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  540. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  541. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  542. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  543. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  544. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  545. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  546. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  547. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  548. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
  549. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
  550. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  551. data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  552. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  553. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
  554. data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
  555. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
  556. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  557. data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  558. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
  559. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
  560. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
  561. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
  562. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  563. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  564. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  565. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
  566. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  567. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  568. data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
  569. data/ext/{ggml → sources/ggml}/src/ggml-quants.c +120 -128
  570. data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  571. data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +494 -84
  572. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
  573. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
  574. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +344 -0
  575. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  576. data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
  577. data/ext/sources/ggml/src/ggml-sycl/common.hpp +561 -0
  578. data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +56 -70
  579. data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
  580. data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +8 -12
  581. data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
  582. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +575 -0
  583. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
  584. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +839 -0
  585. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
  586. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +823 -0
  587. data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +188 -67
  588. data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  589. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2987 -0
  590. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1120 -0
  591. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +84 -0
  592. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +102 -0
  593. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +212 -0
  594. data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
  595. data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1197 -1295
  596. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
  597. data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
  598. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
  599. data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
  600. data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +60 -81
  601. data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
  602. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +1065 -0
  603. data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  604. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +482 -0
  605. data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
  606. data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
  607. data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
  608. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
  609. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +111 -0
  610. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +472 -0
  611. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
  612. data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +38 -28
  613. data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
  614. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +15 -0
  615. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +26 -0
  616. data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +6 -11
  617. data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  618. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1307 -0
  619. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +289 -0
  620. data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
  621. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +200 -0
  622. data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  623. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3822 -1335
  624. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  625. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
  626. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
  627. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
  628. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
  629. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  630. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  631. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  632. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  633. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  634. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  635. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  636. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
  637. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  638. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  639. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  640. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
  641. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
  642. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
  643. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  644. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  645. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  646. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  647. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
  648. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
  649. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
  650. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  651. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  652. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  653. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  654. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  655. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  656. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  657. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  658. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  659. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  660. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  661. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  662. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  663. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  664. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
  665. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  666. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  667. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
  668. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
  669. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  670. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  671. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  672. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
  673. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
  674. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
  675. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
  676. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
  677. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  678. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  679. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  680. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
  681. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
  682. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  683. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  684. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  685. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  686. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
  687. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
  688. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
  689. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  690. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
  691. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  692. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  693. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  694. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
  695. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
  696. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
  697. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  698. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
  699. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
  700. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  701. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
  702. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
  703. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
  704. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
  705. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  706. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  707. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
  708. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  709. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
  710. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  711. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  712. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  713. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  714. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +61 -0
  715. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  716. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
  717. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
  718. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
  719. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
  720. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
  721. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  722. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  723. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  724. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  725. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  726. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
  727. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
  728. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  729. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  730. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
  731. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  732. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  733. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
  734. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
  735. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
  736. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
  737. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
  738. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
  739. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
  740. data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +203 -36
  741. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  742. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  743. data/ext/{ggml → sources/ggml}/src/ggml.c +918 -1782
  744. data/ext/sources/ggml/src/ggml.cpp +26 -0
  745. data/ext/sources/ggml/src/gguf.cpp +1351 -0
  746. data/ext/{include → sources/include}/whisper.h +70 -2
  747. data/ext/sources/src/CMakeLists.txt +145 -0
  748. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  749. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  750. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
  751. data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +36 -10
  752. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
  753. data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +29 -3
  754. data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
  755. data/ext/sources/src/whisper-arch.h +197 -0
  756. data/ext/{src → sources/src}/whisper.cpp +1966 -386
  757. data/ext/sources/tests/CMakeLists.txt +105 -0
  758. data/ext/sources/tests/earnings21/eval.mk +58 -0
  759. data/ext/sources/tests/earnings21/eval.py +68 -0
  760. data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
  761. data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
  762. data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
  763. data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
  764. data/ext/sources/tests/earnings21/requirements.txt +6 -0
  765. data/ext/sources/tests/en-0-ref.txt +1 -0
  766. data/ext/sources/tests/en-1-ref.txt +1 -0
  767. data/ext/sources/tests/en-2-ref.txt +1 -0
  768. data/ext/sources/tests/es-0-ref.txt +1 -0
  769. data/ext/sources/tests/librispeech/eval.mk +39 -0
  770. data/ext/sources/tests/librispeech/eval.py +47 -0
  771. data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
  772. data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
  773. data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
  774. data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
  775. data/ext/sources/tests/librispeech/requirements.txt +6 -0
  776. data/ext/sources/tests/run-tests.sh +130 -0
  777. data/ext/sources/tests/test-c.c +3 -0
  778. data/ext/sources/tests/test-vad-full.cpp +54 -0
  779. data/ext/sources/tests/test-vad.cpp +83 -0
  780. data/ext/sources/tests/test-whisper.js +58 -0
  781. data/extsources.rb +39 -5
  782. data/lib/whisper/context.rb +15 -0
  783. data/lib/whisper/model/uri.rb +202 -126
  784. data/lib/whisper/segment.rb +58 -0
  785. data/sig/whisper.rbs +510 -0
  786. data/test/helper.rb +24 -0
  787. data/{tests → test}/test_callback.rb +45 -3
  788. data/{tests → test}/test_error.rb +2 -2
  789. data/{tests → test}/test_model.rb +47 -0
  790. data/test/test_package.rb +51 -0
  791. data/test/test_params.rb +297 -0
  792. data/test/test_segment.rb +146 -0
  793. data/test/test_vad.rb +19 -0
  794. data/test/test_vad_params.rb +103 -0
  795. data/{tests → test}/test_whisper.rb +106 -36
  796. data/whispercpp.gemspec +5 -5
  797. metadata +837 -134
  798. data/ext/cpu.mk +0 -9
  799. data/ext/examples/dr_wav.h +0 -8815
  800. data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
  801. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
  802. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  803. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -10835
  804. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
  805. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
  806. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
  807. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
  808. data/ext/ggml/src/ggml-sycl/convert.cpp +0 -547
  809. data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
  810. data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
  811. data/ext/ggml/src/ggml-sycl/mmvq.cpp +0 -1015
  812. data/ext/ggml/src/ggml-sycl/norm.cpp +0 -378
  813. data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
  814. data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
  815. data/ext/metal-embed.mk +0 -17
  816. data/ext/metal.mk +0 -6
  817. data/ext/ruby_whisper.cpp +0 -1909
  818. data/ext/scripts/get-flags.mk +0 -38
  819. data/lib/whisper.rb +0 -2
  820. data/tests/helper.rb +0 -7
  821. data/tests/test_package.rb +0 -31
  822. data/tests/test_params.rb +0 -160
  823. data/tests/test_segment.rb +0 -83
  824. /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
  825. /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
  826. /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
  827. /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
  828. /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
  829. /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
  830. /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
  831. /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
  832. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
  833. /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
  834. /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
  835. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
  836. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
  837. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
  838. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
  839. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
  840. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
  841. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
  842. /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
  843. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
  844. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
  845. /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
  846. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-hbm.h → sources/ggml/src/ggml-cpu/hbm.h} +0 -0
  847. /data/ext/{ggml/src/ggml-cpu/ggml-cpu-traits.h → sources/ggml/src/ggml-cpu/traits.h} +0 -0
  848. /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
  849. /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
  850. /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
  851. /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
  852. /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
  853. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
  854. /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
  855. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  856. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  857. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
@@ -0,0 +1,1481 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
7
+
8
+ #include "../../quants.h"
9
+ #include "../../ggml-cpu-impl.h"
10
+
11
+ #include <math.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+ #include <float.h>
15
+ #include <stdlib.h> // for qsort
16
+ #include <stdio.h> // for GGML_ASSERT
17
+
18
+ #define GROUP_MAX_EPS 1e-15f
19
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
20
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
21
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
22
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
23
+
24
+ #define UNUSED GGML_UNUSED
25
+
26
#ifdef __wasm_simd128__
// Recursive generators for the bit-to-byte expansion tables below.
// `zero` is the byte (two hex digits) emitted for a clear bit, `one` the byte
// emitted for a set bit, and `acc` the hex digits pasted together so far.
// Each level appends one byte, so B8 expands to 256 comma-separated 64-bit
// hex literals, ordered by table index.
#define B1(zero,one,acc) 0x ## acc ## zero , 0x ## acc ## one
#define B2(zero,one,acc) B1(zero,one,acc ## zero), B1(zero,one,acc ## one)
#define B3(zero,one,acc) B2(zero,one,acc ## zero), B2(zero,one,acc ## one)
#define B4(zero,one,acc) B3(zero,one,acc ## zero), B3(zero,one,acc ## one)
#define B5(zero,one,acc) B4(zero,one,acc ## zero), B4(zero,one,acc ## one)
#define B6(zero,one,acc) B5(zero,one,acc ## zero), B5(zero,one,acc ## one)
#define B7(zero,one,acc) B6(zero,one,acc ## zero), B6(zero,one,acc ## one)
#define B8(zero,one    ) B7(zero,one,       zero), B7(zero,one,       one)

// Lookup tables that expand the 8 bits of an index into 8 bytes (one byte per bit):
//   table_b2b_0[i]: byte j is 0x10 when bit j of i is set,   0x00 otherwise  -> ( b) << 4
//   table_b2b_1[i]: byte j is 0x10 when bit j of i is clear, 0x00 otherwise  -> (!b) << 4
static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) };
static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) };
#endif
40
+
41
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
42
+ assert(QK8_0 == 32);
43
+ assert(k % QK8_0 == 0);
44
+ const int nb = k / QK8_0;
45
+
46
+ block_q8_0 * GGML_RESTRICT y = vy;
47
+
48
+ #if defined __wasm_simd128__
49
+ for (int i = 0; i < nb; i++) {
50
+ v128_t srcv [8];
51
+ v128_t asrcv[8];
52
+ v128_t amaxv[8];
53
+
54
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
55
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
56
+
57
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
58
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
59
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
60
+
61
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
62
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
63
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
64
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
65
+
66
+ const float d = amax / ((1 << 7) - 1);
67
+ const float id = d ? 1.0f/d : 0.0f;
68
+
69
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
70
+
71
+ for (int j = 0; j < 8; j++) {
72
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
73
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
74
+
75
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
76
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
77
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
78
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
79
+ }
80
+ }
81
+ #else
82
+ GGML_UNUSED(nb);
83
+ // scalar
84
+ quantize_row_q8_0_ref(x, y, k);
85
+ #endif
86
+ }
87
+
88
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
89
+ assert(k % QK8_1 == 0);
90
+ const int nb = k / QK8_1;
91
+
92
+ block_q8_1 * GGML_RESTRICT y = vy;
93
+ #if defined __wasm_simd128__
94
+ for (int i = 0; i < nb; i++) {
95
+ v128_t srcv [8];
96
+ v128_t asrcv[8];
97
+ v128_t amaxv[8];
98
+
99
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
100
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
101
+
102
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
103
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
104
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
105
+
106
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
107
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
108
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
109
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
110
+
111
+ const float d = amax / ((1 << 7) - 1);
112
+ const float id = d ? 1.0f/d : 0.0f;
113
+
114
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
115
+
116
+ v128_t accv = wasm_i32x4_splat(0);
117
+
118
+ for (int j = 0; j < 8; j++) {
119
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
120
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
121
+
122
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
123
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
124
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
125
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
126
+
127
+ accv = wasm_i32x4_add(accv, vi);
128
+ }
129
+
130
+ y[i].s = GGML_CPU_FP32_TO_FP16(
131
+ d * (wasm_i32x4_extract_lane(accv, 0) +
132
+ wasm_i32x4_extract_lane(accv, 1) +
133
+ wasm_i32x4_extract_lane(accv, 2) +
134
+ wasm_i32x4_extract_lane(accv, 3)));
135
+ }
136
+ #else
137
+ GGML_UNUSED(nb);
138
+ // scalar
139
+ quantize_row_q8_1_ref(x, y, k);
140
+ #endif
141
+ }
142
+
143
+ //===================================== Q8_K ==============================================
144
+
145
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
146
+ #ifdef __wasm_simd128__
147
+ assert(k % QK_K == 0);
148
+ const int64_t nb = k / QK_K;
149
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
150
+
151
+ for (int i = 0; i < nb; i++) {
152
+ const float * x_block = x + i * QK_K;
153
+
154
+ v128_t min_vec = wasm_v128_load(x_block);
155
+ v128_t max_vec = min_vec;
156
+
157
+ for (int j = 4; j < QK_K; j += 4) {
158
+ v128_t x_vec = wasm_v128_load(x_block + j);
159
+ max_vec = wasm_f32x4_pmax(max_vec, x_vec);
160
+ min_vec = wasm_f32x4_pmin(min_vec, x_vec);
161
+ }
162
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
163
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
164
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
165
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
166
+ float max = wasm_f32x4_extract_lane(max_vec, 0);
167
+ float min = wasm_f32x4_extract_lane(min_vec, 0);
168
+ float amax = -min > max ? min : max;
169
+
170
+ if (amax == 0.0f) {
171
+ yc[i].d = 0.0f;
172
+ const v128_t zero = wasm_i8x16_splat(0);
173
+ for (int j = 0; j < QK_K; j += 16) {
174
+ wasm_v128_store(yc[i].qs + j, zero);
175
+ }
176
+ continue;
177
+ }
178
+
179
+ const float iscale = -127.0f / amax;
180
+ const v128_t scale_vec = wasm_f32x4_splat(iscale);
181
+
182
+ // Process 16 elements per iteration
183
+ for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) {
184
+ // Load and quantize 16 floats
185
+ v128_t x0 = wasm_v128_load(x_block + j);
186
+ v128_t x1 = wasm_v128_load(x_block + j + 4);
187
+ v128_t x2 = wasm_v128_load(x_block + j + 8);
188
+ v128_t x3 = wasm_v128_load(x_block + j + 12);
189
+
190
+ v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec));
191
+ v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec));
192
+ v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec));
193
+ v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec));
194
+
195
+ // Convert to i32 with saturation
196
+ v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0);
197
+ v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1);
198
+ v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2);
199
+ v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3);
200
+
201
+ // Pack into 16 i8 values
202
+ v128_t i8 = wasm_i8x16_narrow_i16x8(
203
+ wasm_i16x8_narrow_i32x4(i0, i1),
204
+ wasm_i16x8_narrow_i32x4(i2, i3)
205
+ );
206
+ wasm_v128_store(yc[i].qs + j, i8);
207
+
208
+ // Calculate bsums using SIMD
209
+ v128_t sum16 = wasm_i16x8_add(
210
+ wasm_i16x8_extend_low_i8x16(i8),
211
+ wasm_i16x8_extend_high_i8x16(i8)
212
+ );
213
+ v128_t sum32 = wasm_i32x4_add(
214
+ wasm_i32x4_extend_low_i16x8(sum16),
215
+ wasm_i32x4_extend_high_i16x8(sum16)
216
+ );
217
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1));
218
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2));
219
+ yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0);
220
+ }
221
+
222
+ yc[i].d = 1.0f / iscale;
223
+ }
224
+ #else
225
+ quantize_row_q8_K_ref(x, y, k);
226
+ #endif
227
+ }
228
+
229
+
230
+ //===================================== Dot products =================================
231
+
232
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
233
+ const int qk = QK8_0;
234
+ const int nb = n / qk;
235
+
236
+ assert(n % qk == 0);
237
+ assert(nrc == 1);
238
+ UNUSED(nrc);
239
+ UNUSED(bx);
240
+ UNUSED(by);
241
+ UNUSED(bs);
242
+
243
+ const block_q4_0 * GGML_RESTRICT x = vx;
244
+ const block_q8_0 * GGML_RESTRICT y = vy;
245
+
246
+ int ib = 0;
247
+ float sumf = 0;
248
+
249
+ #if defined __wasm_simd128__
250
+ v128_t sumv = wasm_f32x4_splat(0.0f);
251
+
252
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
253
+ const v128_t s8b = wasm_i8x16_splat(0x8);
254
+
255
+ for (; ib + 1 < nb; ib += 2) {
256
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
257
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
258
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
259
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
260
+
261
+ // Load and process x0
262
+ v128_t v0_0 = wasm_v128_load(x0->qs);
263
+ v128_t v0_0l = wasm_v128_and(v0_0, m4b);
264
+ v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
265
+ v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
266
+ v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
267
+
268
+ // Load y0 vectors
269
+ v128_t y0_l = wasm_v128_load(y0->qs);
270
+ v128_t y0_h = wasm_v128_load(y0->qs + 16);
271
+
272
+ // Extend to i16x8 and compute dot products
273
+ v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls);
274
+ v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls);
275
+ v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs);
276
+ v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs);
277
+
278
+ v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l);
279
+ v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l);
280
+ v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h);
281
+ v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h);
282
+
283
+ v128_t dp0 = wasm_i32x4_add(
284
+ wasm_i32x4_add(
285
+ wasm_i32x4_dot_i16x8(dx0l, dy0ll),
286
+ wasm_i32x4_dot_i16x8(dx0h, dy0lh)
287
+ ),
288
+ wasm_i32x4_add(
289
+ wasm_i32x4_dot_i16x8(dx0hl, dy0hl),
290
+ wasm_i32x4_dot_i16x8(dx0hh, dy0hh)
291
+ )
292
+ );
293
+
294
+ // Load and process x1
295
+ v128_t v0_1 = wasm_v128_load(x1->qs);
296
+ v128_t v0_1l = wasm_v128_and(v0_1, m4b);
297
+ v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
298
+ v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
299
+ v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
300
+
301
+ // Load y1 vectors
302
+ v128_t y1_l = wasm_v128_load(y1->qs);
303
+ v128_t y1_h = wasm_v128_load(y1->qs + 16);
304
+
305
+ // Extend to i16x8 and compute dot products
306
+ v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls);
307
+ v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls);
308
+ v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs);
309
+ v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs);
310
+
311
+ v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l);
312
+ v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l);
313
+ v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h);
314
+ v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h);
315
+
316
+ v128_t dp1 = wasm_i32x4_add(
317
+ wasm_i32x4_add(
318
+ wasm_i32x4_dot_i16x8(dx1l, dy1ll),
319
+ wasm_i32x4_dot_i16x8(dx1h, dy1lh)
320
+ ),
321
+ wasm_i32x4_add(
322
+ wasm_i32x4_dot_i16x8(dx1hl, dy1hl),
323
+ wasm_i32x4_dot_i16x8(dx1hh, dy1hh)
324
+ )
325
+ );
326
+
327
+ // Accumulate results with scaling
328
+ float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
329
+ float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);
330
+
331
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
332
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
333
+ }
334
+
335
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
336
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
337
+
338
+ #endif
339
+ for (; ib < nb; ++ib) {
340
+ int sumi0 = 0;
341
+ int sumi1 = 0;
342
+
343
+ for (int j = 0; j < qk/2; ++j) {
344
+ const int v0 = (x[ib].qs[j] & 0x0F) - 8;
345
+ const int v1 = (x[ib].qs[j] >> 4) - 8;
346
+
347
+ sumi0 += (v0 * y[ib].qs[j]);
348
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
349
+ }
350
+
351
+ int sumi = sumi0 + sumi1;
352
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
353
+ }
354
+
355
+ *s = sumf;
356
+ }
357
+
358
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
359
+ const int qk = QK8_0;
360
+ const int nb = n / qk;
361
+
362
+ int ib = 0;
363
+ float sumf = 0;
364
+
365
+ assert(n % qk == 0);
366
+ assert(qk == QK5_0);
367
+ assert(nrc == 1);
368
+ UNUSED(nrc);
369
+ UNUSED(bx);
370
+ UNUSED(by);
371
+ UNUSED(bs);
372
+
373
+ const block_q5_0 * GGML_RESTRICT x = vx;
374
+ const block_q8_0 * GGML_RESTRICT y = vy;
375
+
376
+ #if defined __wasm_simd128__
377
+ v128_t sumv = wasm_f32x4_splat(0.0f);
378
+
379
+ uint32_t qh_;
380
+ uint64_t tmp[4];
381
+
382
+ // TODO: check if unrolling this is better
383
+ for (; ib < nb; ++ib) {
384
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
385
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
386
+
387
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
388
+
389
+ // extract the 5th bit
390
+ memcpy(&qh_, x0->qh, sizeof(qh_));
391
+
392
+ tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF];
393
+ tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF];
394
+ tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF];
395
+ tmp[3] = table_b2b_1[(qh_ >> 24) ];
396
+
397
+ const v128_t qhl = wasm_v128_load(tmp + 0);
398
+ const v128_t qhh = wasm_v128_load(tmp + 2);
399
+
400
+ const v128_t v0 = wasm_v128_load(x0->qs);
401
+
402
+ // 4-bit -> 8-bit
403
+ const v128_t v0l = wasm_v128_and (v0, m4b);
404
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
405
+
406
+ // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
407
+ const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
408
+ const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
409
+
410
+ // load y
411
+ const v128_t v1l = wasm_v128_load(y0->qs);
412
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
413
+
414
+ // int8x16 -> int16x8
415
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
416
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
417
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
418
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
419
+
420
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
421
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
422
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
423
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
424
+
425
+ // dot product
426
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
427
+ wasm_i32x4_add(
428
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
429
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
430
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
431
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
432
+ wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
433
+ }
434
+
435
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
436
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
437
+
438
+ #endif
439
+ for (; ib < nb; ++ib) {
440
+ uint32_t qh;
441
+ memcpy(&qh, x[ib].qh, sizeof(qh));
442
+
443
+ int sumi0 = 0;
444
+ int sumi1 = 0;
445
+
446
+ for (int j = 0; j < qk/2; ++j) {
447
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
448
+ const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
449
+
450
+ const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
451
+ const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
452
+
453
+ sumi0 += (x0 * y[ib].qs[j]);
454
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
455
+ }
456
+
457
+ int sumi = sumi0 + sumi1;
458
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
459
+ }
460
+
461
+ *s = sumf;
462
+ }
463
+
464
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
465
+ const int qk = QK8_1;
466
+ const int nb = n / qk;
467
+
468
+ int ib = 0;
469
+ float sumf = 0;
470
+
471
+ assert(n % qk == 0);
472
+ assert(qk == QK5_1);
473
+ assert(nrc == 1);
474
+ UNUSED(nrc);
475
+ UNUSED(bx);
476
+ UNUSED(by);
477
+ UNUSED(bs);
478
+
479
+ const block_q5_1 * GGML_RESTRICT x = vx;
480
+ const block_q8_1 * GGML_RESTRICT y = vy;
481
+
482
+ #if defined __wasm_simd128__
483
+ v128_t sumv = wasm_f32x4_splat(0.0f);
484
+
485
+ float summs = 0.0f;
486
+
487
+ uint32_t qh_;
488
+ uint64_t tmp[4];
489
+
490
+ // TODO: check if unrolling this is better
491
+ for (; ib < nb; ++ib) {
492
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
493
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
494
+
495
+ summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
496
+
497
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
498
+
499
+ // extract the 5th bit
500
+ memcpy(&qh_, x0->qh, sizeof(qh_));
501
+
502
+ tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF];
503
+ tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF];
504
+ tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF];
505
+ tmp[3] = table_b2b_0[(qh_ >> 24) ];
506
+
507
+ const v128_t qhl = wasm_v128_load(tmp + 0);
508
+ const v128_t qhh = wasm_v128_load(tmp + 2);
509
+
510
+ const v128_t v0 = wasm_v128_load(x0->qs);
511
+
512
+ // 4-bit -> 8-bit
513
+ const v128_t v0l = wasm_v128_and (v0, m4b);
514
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
515
+
516
+ // add high bit
517
+ const v128_t v0lf = wasm_v128_or(v0l, qhl);
518
+ const v128_t v0hf = wasm_v128_or(v0h, qhh);
519
+
520
+ // load y
521
+ const v128_t v1l = wasm_v128_load(y0->qs);
522
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
523
+
524
+ // int8x16 -> int16x8
525
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
526
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
527
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
528
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
529
+
530
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
531
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
532
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
533
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
534
+
535
+ // dot product
536
+ sumv = wasm_f32x4_add(sumv,
537
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
538
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
539
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
540
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
541
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
542
+ wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
543
+ }
544
+
545
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
546
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
547
+
548
+ #endif
549
+ for (; ib < nb; ++ib) {
550
+ uint32_t qh;
551
+ memcpy(&qh, x[ib].qh, sizeof(qh));
552
+
553
+ int sumi0 = 0;
554
+ int sumi1 = 0;
555
+
556
+ for (int j = 0; j < qk/2; ++j) {
557
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
558
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
559
+
560
+ const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
561
+ const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
562
+
563
+ sumi0 += (x0 * y[ib].qs[j]);
564
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
565
+ }
566
+
567
+ int sumi = sumi0 + sumi1;
568
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
569
+ }
570
+
571
+ *s = sumf;
572
+ }
573
+
574
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
575
+ const int qk = QK8_0;
576
+ const int nb = n / qk;
577
+
578
+ assert(n % qk == 0);
579
+ assert(nrc == 1);
580
+ UNUSED(nrc);
581
+ UNUSED(bx);
582
+ UNUSED(by);
583
+ UNUSED(bs);
584
+
585
+ const block_q8_0 * GGML_RESTRICT x = vx;
586
+ const block_q8_0 * GGML_RESTRICT y = vy;
587
+
588
+ int ib = 0;
589
+ float sumf = 0;
590
+
591
+ #if defined __wasm_simd128__
592
+ v128_t sumv = wasm_f32x4_splat(0.0f);
593
+
594
+ for (; ib < nb; ++ib) {
595
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
596
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
597
+
598
+ const v128_t x0_0 = wasm_v128_load(x0->qs);
599
+ const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
600
+ const v128_t y0_0 = wasm_v128_load(y0->qs);
601
+ const v128_t y0_1 = wasm_v128_load(y0->qs + 16);
602
+
603
+ // Extend 8-bit to 16-bit
604
+ const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0);
605
+ const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0);
606
+ const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1);
607
+ const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1);
608
+
609
+ const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0);
610
+ const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0);
611
+ const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1);
612
+ const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1);
613
+
614
+ // Compute dot products
615
+ const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l);
616
+ const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h);
617
+ const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l);
618
+ const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h);
619
+
620
+ // Sum all dot products
621
+ const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
622
+
623
+ // Convert to float and accumulate
624
+ const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
625
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
626
+ }
627
+
628
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
629
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
630
+
631
+ #endif
632
+ for (; ib < nb; ++ib) {
633
+ int sumi = 0;
634
+
635
+ for (int j = 0; j < qk; j++) {
636
+ sumi += x[ib].qs[j]*y[ib].qs[j];
637
+ }
638
+
639
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
640
+ }
641
+
642
+ *s = sumf;
643
+ }
644
+
645
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
646
+ assert(nrc == 1);
647
+ UNUSED(nrc);
648
+ UNUSED(bx);
649
+ UNUSED(by);
650
+ UNUSED(bs);
651
+
652
+ const block_q2_K * GGML_RESTRICT x = vx;
653
+ const block_q8_K * GGML_RESTRICT y = vy;
654
+
655
+ const int nb = n / QK_K;
656
+
657
+ #if defined __wasm_simd128__
658
+ float sumf = 0;
659
+
660
+ for (int i = 0; i < nb; ++i) {
661
+ const uint8_t * q2 = x[i].qs;
662
+ const int8_t * q8 = y[i].qs;
663
+ const uint8_t * sc = x[i].scales;
664
+
665
+ // Vectorized summs calculation
666
+ v128_t summs_vec = wasm_i32x4_splat(0);
667
+ {
668
+ v128_t sc_vec = wasm_v128_load(sc);
669
+ v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4);
670
+
671
+ v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper);
672
+ v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper);
673
+
674
+ v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]);
675
+ v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]);
676
+
677
+ summs_vec = wasm_i32x4_add(
678
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1),
679
+ wasm_i32x4_dot_i16x8(sc_high, bsums2)),
680
+ summs_vec
681
+ );
682
+
683
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1));
684
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2));
685
+ }
686
+ int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0);
687
+
688
+ // Vectorized isum calculation
689
+ int32_t isum = 0;
690
+ const uint8_t * sc_ptr = sc;
691
+ const int k_iters = QK_K/128;
692
+
693
+ for (int k = 0; k < k_iters; ++k) {
694
+ v128_t isum_vec = wasm_i32x4_splat(0);
695
+ int shift = 0;
696
+
697
+ for (int j = 0; j < 4; ++j) {
698
+ const int d0 = (sc_ptr[0] & 0xF);
699
+ const int d1 = (sc_ptr[1] & 0xF);
700
+ sc_ptr += 2;
701
+
702
+ // Process first 16 elements
703
+ v128_t q2_0 = wasm_v128_load(q2);
704
+ v128_t q8_0 = wasm_v128_load(q8);
705
+ v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift);
706
+ v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03));
707
+
708
+ // Process next 16 elements
709
+ v128_t q2_1 = wasm_v128_load(q2 + 16);
710
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
711
+ v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift);
712
+ v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03));
713
+
714
+ // Calculate dot products
715
+ v128_t p0 = wasm_i32x4_dot_i16x8(
716
+ wasm_i16x8_extend_low_i8x16(q8_0),
717
+ wasm_i16x8_extend_low_i8x16(q2_bits_0)
718
+ );
719
+ v128_t p1 = wasm_i32x4_dot_i16x8(
720
+ wasm_i16x8_extend_high_i8x16(q8_0),
721
+ wasm_i16x8_extend_high_i8x16(q2_bits_0)
722
+ );
723
+ v128_t p2 = wasm_i32x4_dot_i16x8(
724
+ wasm_i16x8_extend_low_i8x16(q8_1),
725
+ wasm_i16x8_extend_low_i8x16(q2_bits_1)
726
+ );
727
+ v128_t p3 = wasm_i32x4_dot_i16x8(
728
+ wasm_i16x8_extend_high_i8x16(q8_1),
729
+ wasm_i16x8_extend_high_i8x16(q2_bits_1)
730
+ );
731
+
732
+ // Accumulate scaled results
733
+ v128_t scaled = wasm_i32x4_add(
734
+ wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)),
735
+ wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1))
736
+ );
737
+
738
+ isum_vec = wasm_i32x4_add(isum_vec, scaled);
739
+ q8 += 32;
740
+ shift += 2;
741
+ }
742
+ q2 += 32;
743
+
744
+ // Horizontal sum of isum_vec
745
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1));
746
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2));
747
+ isum += wasm_i32x4_extract_lane(isum_vec, 0);
748
+ }
749
+
750
+ const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
751
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
752
+ sumf += dall * isum - dmin * summs;
753
+ }
754
+
755
+ *s = sumf;
756
+
757
+ #else
758
+
759
+ float sumf = 0;
760
+
761
+ for (int i = 0; i < nb; ++i) {
762
+
763
+ const uint8_t * q2 = x[i].qs;
764
+ const int8_t * q8 = y[i].qs;
765
+ const uint8_t * sc = x[i].scales;
766
+
767
+ int summs = 0;
768
+ for (int j = 0; j < 16; ++j) {
769
+ summs += y[i].bsums[j] * (sc[j] >> 4);
770
+ }
771
+
772
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
773
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
774
+
775
+ int isum = 0;
776
+ int is = 0;
777
+ int d;
778
+ for (int k = 0; k < QK_K/128; ++k) {
779
+ int shift = 0;
780
+ for (int j = 0; j < 4; ++j) {
781
+ d = sc[is++] & 0xF;
782
+ int isuml = 0;
783
+ for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
784
+ isum += d * isuml;
785
+ d = sc[is++] & 0xF;
786
+ isuml = 0;
787
+ for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
788
+ isum += d * isuml;
789
+ shift += 2;
790
+ q8 += 32;
791
+ }
792
+ q2 += 32;
793
+ }
794
+ sumf += dall * isum - dmin * summs;
795
+ }
796
+ *s = sumf;
797
+ #endif
798
+ }
799
+
800
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
801
+ assert(n % QK_K == 0);
802
+ assert(nrc == 1);
803
+ UNUSED(nrc);
804
+ UNUSED(bx);
805
+ UNUSED(by);
806
+ UNUSED(bs);
807
+
808
+ const uint32_t kmask1 = 0x03030303;
809
+ const uint32_t kmask2 = 0x0f0f0f0f;
810
+
811
+ const block_q3_K * GGML_RESTRICT x = vx;
812
+ const block_q8_K * GGML_RESTRICT y = vy;
813
+
814
+ const int nb = n / QK_K;
815
+
816
+ #if defined __wasm_simd128__
817
+ int8_t aux8[QK_K];
818
+ float sums[8] = {0};
819
+ uint32_t auxs[4];
820
+
821
+ float sumf = 0;
822
+ for (int i = 0; i < nb; ++i) {
823
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
824
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
825
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
826
+
827
+ // Process blocks with SIMD
828
+ int8_t * a = aux8;
829
+ uint8_t m = 1;
830
+ for (int j = 0; j < QK_K; j += 128) {
831
+ for (int shift = 0; shift <= 6; shift += 2) {
832
+ v128_t v_m = wasm_i8x16_splat(m);
833
+ for (int l = 0; l < 32; l += 16) {
834
+ v128_t v_q3 = wasm_v128_load(q3 + l);
835
+ v128_t v_shift = wasm_i8x16_shr(v_q3, shift);
836
+ v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03));
837
+
838
+ v128_t v_hm = wasm_v128_load(hm + l);
839
+ v128_t v_mask = wasm_v128_and(v_hm, v_m);
840
+ v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0));
841
+
842
+ v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask)));
843
+ wasm_v128_store(a + l, v_low2);
844
+ }
845
+ a += 32;
846
+ m <<= 1;
847
+ }
848
+ q3 += 32;
849
+ }
850
+
851
+ // Extract scales
852
+ memcpy(auxs, x[i].scales, 12);
853
+ uint32_t tmp = auxs[2];
854
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
855
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
856
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
857
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
858
+ const int8_t * scales = (const int8_t *)auxs;
859
+
860
+ // SIMD dot product with register accumulators
861
+ v128_t v_acc0 = wasm_i32x4_splat(0);
862
+ v128_t v_acc1 = wasm_i32x4_splat(0);
863
+ a = aux8;
864
+ for (int j = 0; j < QK_K/16; ++j) {
865
+ const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32);
866
+
867
+ // Process 16 elements per iteration
868
+ for (int k = 0; k < 2; ++k) {
869
+ const v128_t v_q8 = wasm_i16x8_load8x8(q8);
870
+ const v128_t v_a = wasm_i16x8_load8x8(a);
871
+
872
+ v128_t v_prod = wasm_i16x8_mul(v_q8, v_a);
873
+ v_prod = wasm_i16x8_mul(v_prod, v_scale);
874
+
875
+ v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod));
876
+ v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod));
877
+
878
+ q8 += 8;
879
+ a += 8;
880
+ }
881
+ }
882
+
883
+ // Accumulate results
884
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
885
+ const v128_t v_d = wasm_f32x4_splat(d);
886
+ v128_t v_sum = wasm_f32x4_add(
887
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
888
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d)
889
+ );
890
+
891
+ // Accumulate into sums vector
892
+ wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum));
893
+ }
894
+
895
+ // Horizontal sum
896
+ v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4));
897
+ sumf = wasm_f32x4_extract_lane(v_sum, 0) +
898
+ wasm_f32x4_extract_lane(v_sum, 1) +
899
+ wasm_f32x4_extract_lane(v_sum, 2) +
900
+ wasm_f32x4_extract_lane(v_sum, 3);
901
+
902
+ *s = sumf;
903
+
904
+ #else
905
+ // scalar version
906
+ // This function is written like this so the compiler can manage to vectorize most of it
907
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
908
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
909
+ // The ideal situation would be if we could just write the code once, and the compiler would
910
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
911
+ // write vectorized versions for AVX, ARM_NEON, etc.
912
+
913
+ int8_t aux8[QK_K];
914
+ int16_t aux16[8];
915
+ float sums [8];
916
+ int32_t aux32[8];
917
+ memset(sums, 0, 8*sizeof(float));
918
+
919
+ uint32_t auxs[4];
920
+ const int8_t * scales = (const int8_t*)auxs;
921
+
922
+ float sumf = 0;
923
+ for (int i = 0; i < nb; ++i) {
924
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
925
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
926
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
927
+ memset(aux32, 0, 8*sizeof(int32_t));
928
+ int8_t * GGML_RESTRICT a = aux8;
929
+ uint8_t m = 1;
930
+ for (int j = 0; j < QK_K; j += 128) {
931
+ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
932
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
933
+ a += 32; m <<= 1;
934
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
935
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
936
+ a += 32; m <<= 1;
937
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
938
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
939
+ a += 32; m <<= 1;
940
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
941
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
942
+ a += 32; m <<= 1;
943
+ q3 += 32;
944
+ }
945
+ a = aux8;
946
+
947
+ memcpy(auxs, x[i].scales, 12);
948
+ uint32_t tmp = auxs[2];
949
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
950
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
951
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
952
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
953
+ for (int j = 0; j < QK_K/16; ++j) {
954
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
955
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
956
+ q8 += 8; a += 8;
957
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
958
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
959
+ q8 += 8; a += 8;
960
+ }
961
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
962
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
963
+ }
964
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
965
+ *s = sumf;
966
+
967
+ #endif
968
+
969
+ }
970
+
971
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
972
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-sized blocks
973
+ assert(nrc == 1); // this implementation only accepts nrc == 1
974
+ UNUSED(nrc);
975
+ UNUSED(bx);
976
+ UNUSED(by);
977
+ UNUSED(bs);
978
+ // Dot product over n values: vx is read as block_q4_K blocks, vy as block_q8_K blocks; the single float result is stored in *s.
979
+ const block_q4_K * GGML_RESTRICT x = vx;
980
+ const block_q8_K * GGML_RESTRICT y = vy;
981
+
982
+ const int nb = n / QK_K; // number of blocks to walk
983
+ // Byte-wise masks used below to unpack the 12 packed scale/min bytes of each block.
984
+ static const uint32_t kmask1 = 0x3f3f3f3f;
985
+ static const uint32_t kmask2 = 0x0f0f0f0f;
986
+ static const uint32_t kmask3 = 0x03030303;
987
+ // Scratch for the unpacked values: the bytes of utmp[0..1] are read as scales, the bytes of utmp[2..3] as mins.
988
+ uint32_t utmp[4];
989
+ // SIMD path when compiling with WASM SIMD128; the #else branch is the scalar fallback.
990
+ #if defined __wasm_simd128__
991
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
992
+ float sumf = 0;
993
+
994
+ for (int i = 0; i < nb; ++i) {
995
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
996
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // factor for the mins term, which is subtracted from the result below
997
+
998
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
999
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1000
+
1001
+ // Unpack the 12 packed bytes in place; afterwards every byte of utmp holds a value of at most 6 bits
1002
+ memcpy(utmp, x[i].scales, 12);
1003
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1004
+ const uint32_t uaux = utmp[1] & kmask1;
1005
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1006
+ utmp[2] = uaux;
1007
+ utmp[0] &= kmask1;
1008
+
1009
+ // Mins term: each of the 8 mins multiplies the sum of two consecutive bsums entries
1010
+ int32_t sumi = 0;
1011
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums; // presumably partial sums of the q8 values - confirm against block_q8_K
1012
+ const uint8_t * m = (const uint8_t *)&utmp[2];
1013
+ for (int j = 0; j < 16; j += 2) {
1014
+ sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
1015
+ }
1016
+ sumf -= dmin * sumi;
1017
+
1018
+ int32_t sumi1 = 0; // scaled low-nibble products of this block
1019
+ int32_t sumi2 = 0; // scaled high-nibble products of this block
1020
+
1021
+ for (int j = 0; j < QK_K/64; ++j) {
1022
+ // Load 64 4-bit weights (32 bytes)
1023
+ const v128_t q4x0 = wasm_v128_load(q4);
1024
+ const v128_t q4x1 = wasm_v128_load(q4 + 16);
1025
+ q4 += 32;
1026
+
1027
+ // Split into low/high nibbles
1028
+ const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F));
1029
+ const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4);
1030
+ const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F));
1031
+ const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4);
1032
+
1033
+ // Load 64 8-bit values (64 bytes)
1034
+ const v128_t q8x0 = wasm_v128_load(q8);
1035
+ const v128_t q8x1 = wasm_v128_load(q8 + 16);
1036
+ const v128_t q8x2 = wasm_v128_load(q8 + 32);
1037
+ const v128_t q8x3 = wasm_v128_load(q8 + 48);
1038
+ q8 += 64;
1039
+
1040
+ // Low nibbles pair with the first 32 q8 values; both sides are widened to 16 bit before the dot product
1041
+ v128_t vacc1 = wasm_i32x4_dot_i16x8(
1042
+ wasm_i16x8_extend_low_i8x16(q4l0),
1043
+ wasm_i16x8_extend_low_i8x16(q8x0)
1044
+ );
1045
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
1046
+ wasm_i16x8_extend_high_i8x16(q4l0),
1047
+ wasm_i16x8_extend_high_i8x16(q8x0)
1048
+ ));
1049
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
1050
+ wasm_i16x8_extend_low_i8x16(q4l1),
1051
+ wasm_i16x8_extend_low_i8x16(q8x1)
1052
+ ));
1053
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
1054
+ wasm_i16x8_extend_high_i8x16(q4l1),
1055
+ wasm_i16x8_extend_high_i8x16(q8x1)
1056
+ ));
1057
+
1058
+ // High nibbles pair with the following 32 q8 values
1059
+ v128_t vacc2 = wasm_i32x4_dot_i16x8(
1060
+ wasm_i16x8_extend_low_i8x16(q4h0),
1061
+ wasm_i16x8_extend_low_i8x16(q8x2)
1062
+ );
1063
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
1064
+ wasm_i16x8_extend_high_i8x16(q4h0),
1065
+ wasm_i16x8_extend_high_i8x16(q8x2)
1066
+ ));
1067
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
1068
+ wasm_i16x8_extend_low_i8x16(q4h1),
1069
+ wasm_i16x8_extend_low_i8x16(q8x3)
1070
+ ));
1071
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
1072
+ wasm_i16x8_extend_high_i8x16(q4h1),
1073
+ wasm_i16x8_extend_high_i8x16(q8x3)
1074
+ ));
1075
+
1076
+ // Reduce each accumulator to a scalar and weight it with its own scale byte
1077
+ int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) +
1078
+ wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3);
1079
+ sumi1 += vacc1_sum * scales[2*j];
1080
+
1081
+ int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) +
1082
+ wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3);
1083
+ sumi2 += vacc2_sum * scales[2*j+1];
1084
+ }
1085
+
1086
+ sumf += d * (sumi1 + sumi2);
1087
+ }
1088
+
1089
+ *s = sumf;
1090
+
1091
+ #else
1092
+ // Scalar fallback: byte views onto the unpacked scratch words
1093
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
1094
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
1095
+
1096
+ int8_t aux8[QK_K]; // unpacked 4-bit values of the current block
1097
+ int16_t aux16[8]; // 8 products at a time
1098
+ float sums [8]; // 8 float partial sums carried across all blocks
1099
+ int32_t aux32[8]; // 8 integer partial sums for the current block
1100
+ memset(sums, 0, 8*sizeof(float));
1101
+
1102
+ float sumf = 0;
1103
+ for (int i = 0; i < nb; ++i) {
1104
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1105
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1106
+ memset(aux32, 0, 8*sizeof(int32_t));
1107
+ int8_t * GGML_RESTRICT a = aux8;
1108
+ for (int j = 0; j < QK_K/64; ++j) { // every 32 source bytes yield 32 low nibbles followed by 32 high nibbles
1109
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1110
+ a += 32;
1111
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1112
+ a += 32; q4 += 32;
1113
+ }
1114
+ memcpy(utmp, x[i].scales, 12); // unpack the 12 packed bytes into 8 scale bytes and 8 min bytes
1115
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1116
+ const uint32_t uaux = utmp[1] & kmask1;
1117
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1118
+ utmp[2] = uaux;
1119
+ utmp[0] &= kmask1;
1120
+
1121
+ int sumi = 0;
1122
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; // each min covers two bsums entries
1123
+ a = aux8;
1124
+ int is = 0;
1125
+ for (int j = 0; j < QK_K/32; ++j) { // one scale per 32 values, handled as four groups of 8
1126
+ int32_t scale = scales[is++];
1127
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1128
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1129
+ q8 += 8; a += 8;
1130
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1131
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1132
+ q8 += 8; a += 8;
1133
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1134
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1135
+ q8 += 8; a += 8;
1136
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1137
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1138
+ q8 += 8; a += 8;
1139
+ }
1140
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1141
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1142
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1143
+ sumf -= dmin * sumi; // the mins term is subtracted
1144
+ }
1145
+ for (int l = 0; l < 8; ++l) sumf += sums[l]; // fold the 8 partial sums into the result
1146
+ *s = sumf;
1147
+ #endif
1148
+ }
1149
+
1150
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1151
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-sized blocks
1152
+ assert(nrc == 1); // this implementation only accepts nrc == 1
1153
+ UNUSED(nrc);
1154
+ UNUSED(bx);
1155
+ UNUSED(by);
1156
+ UNUSED(bs);
1157
+ // Dot product over n values: vx is read as block_q5_K blocks, vy as block_q8_K blocks; the single float result is stored in *s.
1158
+ const block_q5_K * GGML_RESTRICT x = vx;
1159
+ const block_q8_K * GGML_RESTRICT y = vy;
1160
+
1161
+ const int nb = n / QK_K; // number of blocks to walk
1162
+ // Byte-wise masks used below to unpack the 12 packed scale/min bytes of each block.
1163
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1164
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1165
+ static const uint32_t kmask3 = 0x03030303;
1166
+ // Scratch for the unpacked values: the bytes of utmp[0..1] are read as scales, the bytes of utmp[2..3] as mins.
1167
+ uint32_t utmp[4];
1168
+ // SIMD path when compiling with WASM SIMD128; the #else branch is the scalar fallback.
1169
+ #if defined __wasm_simd128__
1170
+ // The scale bytes are read through the `sc` pointer declared inside the block loop.
1171
+ float sumf = 0;
1172
+
1173
+ for (int i = 0; i < nb; ++i) {
1174
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1175
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // factor for the mins term, which is subtracted from the result below
1176
+
1177
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1178
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1179
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1180
+
1181
+ // Unpack the 12 packed bytes in place; afterwards every byte of utmp holds a value of at most 6 bits
1182
+ memcpy(utmp, x[i].scales, 12);
1183
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1184
+ const uint32_t uaux = utmp[1] & kmask1;
1185
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1186
+ utmp[2] = uaux;
1187
+ utmp[0] &= kmask1;
1188
+
1189
+ // Mins term: each of the 8 mins multiplies the sum of two consecutive bsums entries
1190
+ int32_t sumi_mins = 0;
1191
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums; // presumably partial sums of the q8 values - confirm against block_q8_K
1192
+ const uint8_t * m = (const uint8_t *)&utmp[2];
1193
+ for (int j = 0; j < 16; j += 2) {
1194
+ sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
1195
+ }
1196
+ sumf -= dmin * sumi_mins; // the mins term is subtracted from the running result
1197
+ // The 32 qh bytes are loaded once per block; each loop iteration below consumes two bits of every byte.
1198
+ v128_t qh0 = wasm_v128_load(qh);
1199
+ v128_t qh1 = wasm_v128_load(qh + 16);
1200
+ const uint8_t * sc = (const uint8_t *)utmp; // byte view of utmp; sc[0..7] are used as scales below
1201
+
1202
+ int32_t sumi = 0;
1203
+
1204
+ for (int j = 0; j < QK_K/64; ++j) {
1205
+ const int shift = j * 2; // iteration j uses bits 2j and 2j+1 of every qh byte
1206
+ v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
1207
+ v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
1208
+ // Isolate the two bits and move each one to bit position 4 so it can be OR-ed onto a nibble
1209
+ v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
1210
+ v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
1211
+ v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
1212
+ v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
1213
+
1214
+ v128_t q5_0 = wasm_v128_load(q5);
1215
+ v128_t q5_1 = wasm_v128_load(q5 + 16);
1216
+ q5 += 32;
1217
+ // Combine each nibble with its extra bit into a 5-bit value (0..31)
1218
+ v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
1219
+ v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
1220
+ v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
1221
+ v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
1222
+ // Load the 64 q8 values that pair with these 64 weights
1223
+ v128_t q8_0 = wasm_v128_load(q8);
1224
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
1225
+ v128_t q8_2 = wasm_v128_load(q8 + 32);
1226
+ v128_t q8_3 = wasm_v128_load(q8 + 48);
1227
+ q8 += 64;
1228
+
1229
+ // Low-nibble values pair with the first 32 q8 values
1230
+ v128_t pl0 = wasm_i32x4_dot_i16x8(
1231
+ wasm_i16x8_extend_low_i8x16(q5l_0),
1232
+ wasm_i16x8_extend_low_i8x16(q8_0)
1233
+ );
1234
+ pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
1235
+ wasm_i16x8_extend_high_i8x16(q5l_0),
1236
+ wasm_i16x8_extend_high_i8x16(q8_0)
1237
+ ));
1238
+ v128_t pl1 = wasm_i32x4_dot_i16x8(
1239
+ wasm_i16x8_extend_low_i8x16(q5l_1),
1240
+ wasm_i16x8_extend_low_i8x16(q8_1)
1241
+ );
1242
+ pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
1243
+ wasm_i16x8_extend_high_i8x16(q5l_1),
1244
+ wasm_i16x8_extend_high_i8x16(q8_1)
1245
+ ));
1246
+ v128_t sum_low = wasm_i32x4_add(pl0, pl1);
1247
+
1248
+ // High-nibble values pair with the following 32 q8 values
1249
+ v128_t ph0 = wasm_i32x4_dot_i16x8(
1250
+ wasm_i16x8_extend_low_i8x16(q5h_0),
1251
+ wasm_i16x8_extend_low_i8x16(q8_2)
1252
+ );
1253
+ ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
1254
+ wasm_i16x8_extend_high_i8x16(q5h_0),
1255
+ wasm_i16x8_extend_high_i8x16(q8_2)
1256
+ ));
1257
+ v128_t ph1 = wasm_i32x4_dot_i16x8(
1258
+ wasm_i16x8_extend_low_i8x16(q5h_1),
1259
+ wasm_i16x8_extend_low_i8x16(q8_3)
1260
+ );
1261
+ ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
1262
+ wasm_i16x8_extend_high_i8x16(q5h_1),
1263
+ wasm_i16x8_extend_high_i8x16(q8_3)
1264
+ ));
1265
+ v128_t sum_high = wasm_i32x4_add(ph0, ph1);
1266
+
1267
+ // Reduce each vector to a scalar, then weight the two halves with their own scale bytes
1268
+ int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
1269
+ wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
1270
+ int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
1271
+ wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
1272
+
1273
+ sumi += sl * sc[2*j] + sh * sc[2*j+1];
1274
+ }
1275
+
1276
+ sumf += d * sumi;
1277
+ }
1278
+
1279
+ *s = sumf;
1280
+
1281
+ #else
1282
+ // Scalar fallback: byte views onto the unpacked scratch words
1283
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
1284
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
1285
+
1286
+ int8_t aux8[QK_K]; // unpacked 5-bit values of the current block
1287
+ int16_t aux16[8]; // 8 products at a time
1288
+ float sums [8]; // 8 float partial sums carried across all blocks
1289
+ int32_t aux32[8]; // 8 integer partial sums for the current block
1290
+ memset(sums, 0, 8*sizeof(float));
1291
+
1292
+ float sumf = 0;
1293
+ for (int i = 0; i < nb; ++i) {
1294
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1295
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
1296
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1297
+ memset(aux32, 0, 8*sizeof(int32_t));
1298
+ int8_t * GGML_RESTRICT a = aux8;
1299
+ uint8_t m = 1; // selects which bit of each hm byte is the extra bit for the current group of 32 values
1300
+ for (int j = 0; j < QK_K/64; ++j) {
1301
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1302
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); // add 16 when the selected bit is set
1303
+ a += 32; m <<= 1;
1304
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1305
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1306
+ a += 32; m <<= 1;
1307
+ q4 += 32;
1308
+ }
1309
+ memcpy(utmp, x[i].scales, 12); // unpack the 12 packed bytes into 8 scale bytes and 8 min bytes
1310
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1311
+ const uint32_t uaux = utmp[1] & kmask1;
1312
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1313
+ utmp[2] = uaux;
1314
+ utmp[0] &= kmask1;
1315
+
1316
+ int sumi = 0;
1317
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; // each min covers two bsums entries
1318
+ a = aux8;
1319
+ int is = 0;
1320
+ for (int j = 0; j < QK_K/32; ++j) { // one scale per 32 values, handled as four groups of 8
1321
+ int32_t scale = scales[is++];
1322
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1323
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1324
+ q8 += 8; a += 8;
1325
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1326
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1327
+ q8 += 8; a += 8;
1328
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1329
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1330
+ q8 += 8; a += 8;
1331
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1332
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1333
+ q8 += 8; a += 8;
1334
+ }
1335
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1336
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1337
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1338
+ sumf -= dmin * sumi; // the mins term is subtracted
1339
+ }
1340
+ for (int l = 0; l < 8; ++l) sumf += sums[l]; // fold the 8 partial sums into the result
1341
+ *s = sumf;
1342
+ #endif
1343
+ }
1344
+
1345
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1346
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-sized blocks
1347
+ assert(nrc == 1); // this implementation only accepts nrc == 1
1348
+ UNUSED(nrc);
1349
+ UNUSED(bx);
1350
+ UNUSED(by);
1351
+ UNUSED(bs);
1352
+ // Dot product over n values: vx is read as block_q6_K blocks, vy as block_q8_K blocks; the single float result is stored in *s.
1353
+ const block_q6_K * GGML_RESTRICT x = vx;
1354
+ const block_q8_K * GGML_RESTRICT y = vy;
1355
+
1356
+ const int nb = n / QK_K; // number of blocks to walk
1357
+ // SIMD path when compiling with WASM SIMD128; the #else branch is the scalar fallback.
1358
+ #if defined __wasm_simd128__
1359
+ int8_t aux8[QK_K] __attribute__((aligned(16))); // unpacked signed values of the current block
1360
+ int32_t aux32[8] __attribute__((aligned(16))) = {0}; // receives the two SIMD accumulators of the current block
1361
+ float sums[8] __attribute__((aligned(16))) = {0}; // 8 float partial sums carried across all blocks
1362
+
1363
+ for (int i = 0; i < nb; ++i) {
1364
+ // Unpack the block into aux8: 4 bits from ql plus 2 bits from qh per value, then subtract 32
1365
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1366
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1367
+ int8_t * a = aux8;
1368
+ for (int j = 0; j < QK_K; j += 128) { // each pass consumes 64 ql bytes and 32 qh bytes and emits 128 values
1369
+ for (int l = 0; l < 32; ++l) {
1370
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1371
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1372
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1373
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1374
+ }
1375
+ a += 128;
1376
+ q4 += 64;
1377
+ qh += 32;
1378
+ }
1379
+
1380
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
1381
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1382
+ v128_t acc0 = wasm_i32x4_splat(0);
1383
+ v128_t acc1 = wasm_i32x4_splat(0);
1384
+
1385
+ for (int j = 0; j < QK_K/16; ++j) { // one scale per group of 16 values
1386
+ const int scale = x[i].scales[j];
1387
+ const v128_t vscale = wasm_i32x4_splat(scale);
1388
+
1389
+ // Load 16 elements from a and q8
1390
+ const v128_t a_vec = wasm_v128_load(a_ptr);
1391
+ const v128_t q8_vec = wasm_v128_load(q8);
1392
+
1393
+ // Low 8 elements: widen to 16 bit, multiply, then widen the products to 32 bit
1394
+ v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec);
1395
+ v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec);
1396
+ v128_t prod_low = wasm_i16x8_mul(a_low, q8_low);
1397
+ v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low);
1398
+ v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low);
1399
+
1400
+ // High 8 elements: same widening and multiply
1401
+ v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec);
1402
+ v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec);
1403
+ v128_t prod_high = wasm_i16x8_mul(a_high, q8_high);
1404
+ v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high);
1405
+ v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high);
1406
+
1407
+ // Apply the group scale in 32-bit lanes, then accumulate
1408
+ prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale);
1409
+ prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale);
1410
+ prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale);
1411
+ prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale);
1412
+
1413
+ acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo));
1414
+ acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi));
1415
+
1416
+ a_ptr += 16;
1417
+ q8 += 16;
1418
+ }
1419
+
1420
+ // Spill the two accumulators into aux32[0..3] and aux32[4..7]
1421
+ wasm_v128_store(&aux32[0], acc0);
1422
+ wasm_v128_store(&aux32[4], acc1);
1423
+
1424
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1425
+ for (int l = 0; l < 8; ++l) {
1426
+ sums[l] += d * aux32[l];
1427
+ }
1428
+ }
1429
+
1430
+ // Fold the 8 partial sums into the result
1431
+ float sumf = 0;
1432
+ for (int l = 0; l < 8; ++l) {
1433
+ sumf += sums[l];
1434
+ }
1435
+ *s = sumf;
1436
+
1437
+ #else
1438
+ // Scalar fallback
1439
+ int8_t aux8[QK_K]; // unpacked signed values of the current block
1440
+ int16_t aux16[8]; // 8 products at a time
1441
+ float sums [8]; // 8 float partial sums carried across all blocks
1442
+ int32_t aux32[8]; // 8 integer partial sums for the current block
1443
+ memset(sums, 0, 8*sizeof(float));
1444
+
1445
+ float sumf = 0;
1446
+ for (int i = 0; i < nb; ++i) {
1447
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1448
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1449
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1450
+ memset(aux32, 0, 8*sizeof(int32_t));
1451
+ int8_t * GGML_RESTRICT a = aux8;
1452
+ for (int j = 0; j < QK_K; j += 128) { // each pass consumes 64 ql bytes and 32 qh bytes and emits 128 values
1453
+ for (int l = 0; l < 32; ++l) {
1454
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1455
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1456
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1457
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1458
+ }
1459
+ a += 128;
1460
+ q4 += 64;
1461
+ qh += 32;
1462
+ }
1463
+ a = aux8;
1464
+ int is = 0;
1465
+ for (int j = 0; j < QK_K/16; ++j) { // one scale per 16 values, handled as two groups of 8
1466
+ int scale = x[i].scales[is++];
1467
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1468
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1469
+ q8 += 8; a += 8;
1470
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1471
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1472
+ q8 += 8; a += 8;
1473
+ }
1474
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1475
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1476
+ }
1477
+ for (int l = 0; l < 8; ++l) sumf += sums[l]; // fold the 8 partial sums into the result
1478
+ *s = sumf;
1479
+ #endif
1480
+ }
1481
+