whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,1221 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
7
+
8
+ #include "../../quants.h"
9
+ #include "../../ggml-cpu-impl.h"
10
+
11
+ #include <math.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+ #include <float.h>
15
+ #include <stdlib.h> // for qsort
16
+ #include <stdio.h> // for GGML_ASSERT
17
+
18
+ #define GROUP_MAX_EPS 1e-15f // NOTE(review): not referenced in the visible part of this file; presumably a near-zero threshold for a group's max value - confirm
18
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f // presumably the IQ3_XXS-specific threshold (per its name) - confirm
19
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f // presumably the IQ2_S-specific threshold (per its name) - confirm
20
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f // presumably the IQ1_M-specific threshold (per its name) - confirm
21
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f // presumably the IQ1_S-specific threshold (per its name) - confirm
22
+
23
+ #define UNUSED GGML_UNUSED // local shorthand used by the dot-product kernels for ignored parameters
25
+
26
+ #if defined(__wasm_simd128__) // the lookup tables below are only built for WASM SIMD128 targets
27
+ #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s // two hex literals: prefix n followed by c, then prefix n followed by s
28
+ #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) // B2..B7: each level doubles the entry count by appending c or s to the prefix
29
+ #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
30
+ #define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
31
+ #define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
32
+ #define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
33
+ #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
34
+ #define B8(c,s ) B7(c,s, c), B7(c,s, s) // 256 literals; in entry i, byte j (least significant first) is s when bit j of i is set, else c
35
+
36
+ // precomputed tables for expanding 8 bits to 8 bytes: entry i holds one byte per bit of the index i
37
+ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4: byte is 0x10 where the bit is set, 0x00 where it is clear
38
+ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4: byte is 0x10 where the bit is clear, 0x00 where it is set
39
+ #endif // __wasm_simd128__
40
+
41
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
42
+ assert(QK8_0 == 32);
43
+ assert(k % QK8_0 == 0);
44
+ const int nb = k / QK8_0;
45
+
46
+ block_q8_0 * GGML_RESTRICT y = vy;
47
+
48
+ #if defined __wasm_simd128__
49
+ for (int i = 0; i < nb; i++) {
50
+ v128_t srcv [8];
51
+ v128_t asrcv[8];
52
+ v128_t amaxv[8];
53
+
54
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
55
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
56
+
57
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
58
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
59
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
60
+
61
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
62
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
63
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
64
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
65
+
66
+ const float d = amax / ((1 << 7) - 1);
67
+ const float id = d ? 1.0f/d : 0.0f;
68
+
69
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
70
+
71
+ for (int j = 0; j < 8; j++) {
72
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
73
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
74
+
75
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
76
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
77
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
78
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
79
+ }
80
+ }
81
+ #else
82
+ GGML_UNUSED(nb);
83
+ // scalar
84
+ quantize_row_q8_0_ref(x, y, k);
85
+ #endif
86
+ }
87
+
88
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
89
+ assert(k % QK8_1 == 0);
90
+ const int nb = k / QK8_1;
91
+
92
+ block_q8_1 * GGML_RESTRICT y = vy;
93
+ #if defined __wasm_simd128__
94
+ for (int i = 0; i < nb; i++) {
95
+ v128_t srcv [8];
96
+ v128_t asrcv[8];
97
+ v128_t amaxv[8];
98
+
99
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
100
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
101
+
102
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
103
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
104
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
105
+
106
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
107
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
108
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
109
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
110
+
111
+ const float d = amax / ((1 << 7) - 1);
112
+ const float id = d ? 1.0f/d : 0.0f;
113
+
114
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
115
+
116
+ v128_t accv = wasm_i32x4_splat(0);
117
+
118
+ for (int j = 0; j < 8; j++) {
119
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
120
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
121
+
122
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
123
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
124
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
125
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
126
+
127
+ accv = wasm_i32x4_add(accv, vi);
128
+ }
129
+
130
+ y[i].s = GGML_CPU_FP32_TO_FP16(
131
+ d * (wasm_i32x4_extract_lane(accv, 0) +
132
+ wasm_i32x4_extract_lane(accv, 1) +
133
+ wasm_i32x4_extract_lane(accv, 2) +
134
+ wasm_i32x4_extract_lane(accv, 3)));
135
+ }
136
+ #else
137
+ GGML_UNUSED(nb);
138
+ // scalar
139
+ quantize_row_q8_1_ref(x, y, k);
140
+ #endif
141
+ }
142
+
143
+ //===================================== Q8_K ==============================================
144
+
145
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
146
+ #ifdef __wasm_simd128__
147
+ assert(k % QK_K == 0);
148
+ const int64_t nb = k / QK_K;
149
+ block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type
150
+
151
+ for (int i = 0; i < nb; i++) {
152
+ const float * x_block = x + i * QK_K;
153
+
154
+ v128_t min_vec = wasm_v128_load(x_block);
155
+ v128_t max_vec = min_vec;
156
+
157
+ for (int j = 4; j < QK_K; j += 4) {
158
+ v128_t x_vec = wasm_v128_load(x_block + j);
159
+ max_vec = wasm_f32x4_pmax(max_vec, x_vec);
160
+ min_vec = wasm_f32x4_pmin(min_vec, x_vec);
161
+ }
162
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1));
163
+ max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2));
164
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1));
165
+ min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2));
166
+ float max = wasm_f32x4_extract_lane(max_vec, 0);
167
+ float min = wasm_f32x4_extract_lane(min_vec, 0);
168
+ float amax = -min > max ? min : max;
169
+
170
+ if (amax == 0.0f) {
171
+ yc[i].d = 0.0f;
172
+ const v128_t zero = wasm_i8x16_splat(0);
173
+ for (int j = 0; j < QK_K; j += 16) {
174
+ wasm_v128_store(yc[i].qs + j, zero);
175
+ }
176
+ continue;
177
+ }
178
+
179
+ const float iscale = -127.0f / amax;
180
+ const v128_t scale_vec = wasm_f32x4_splat(iscale);
181
+
182
+ // Process 16 elements per iteration
183
+ for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) {
184
+ // Load and quantize 16 floats
185
+ v128_t x0 = wasm_v128_load(x_block + j);
186
+ v128_t x1 = wasm_v128_load(x_block + j + 4);
187
+ v128_t x2 = wasm_v128_load(x_block + j + 8);
188
+ v128_t x3 = wasm_v128_load(x_block + j + 12);
189
+
190
+ v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec));
191
+ v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec));
192
+ v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec));
193
+ v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec));
194
+
195
+ // Convert to i32 with saturation
196
+ v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0);
197
+ v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1);
198
+ v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2);
199
+ v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3);
200
+
201
+ // Pack into 16 i8 values
202
+ v128_t i8 = wasm_i8x16_narrow_i16x8(
203
+ wasm_i16x8_narrow_i32x4(i0, i1),
204
+ wasm_i16x8_narrow_i32x4(i2, i3)
205
+ );
206
+ wasm_v128_store(yc[i].qs + j, i8);
207
+
208
+ // Calculate bsums using SIMD
209
+ v128_t sum16 = wasm_i16x8_add(
210
+ wasm_i16x8_extend_low_i8x16(i8),
211
+ wasm_i16x8_extend_high_i8x16(i8)
212
+ );
213
+ v128_t sum32 = wasm_i32x4_add(
214
+ wasm_i32x4_extend_low_i16x8(sum16),
215
+ wasm_i32x4_extend_high_i16x8(sum16)
216
+ );
217
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1));
218
+ sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2));
219
+ yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0);
220
+ }
221
+
222
+ yc[i].d = 1.0f / iscale;
223
+ }
224
+ #else
225
+ quantize_row_q8_K_ref(x, y, k);
226
+ #endif
227
+ }
228
+
229
+
230
+ //===================================== Dot products =================================
231
+
232
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
233
+ const int qk = QK8_0;
234
+ const int nb = n / qk;
235
+
236
+ assert(n % qk == 0);
237
+ assert(nrc == 1);
238
+ UNUSED(nrc);
239
+ UNUSED(bx);
240
+ UNUSED(by);
241
+ UNUSED(bs);
242
+
243
+ const block_q4_0 * GGML_RESTRICT x = vx;
244
+ const block_q8_0 * GGML_RESTRICT y = vy;
245
+
246
+ int ib = 0;
247
+ float sumf = 0;
248
+
249
+ #if defined __wasm_simd128__
250
+ v128_t sumv = wasm_f32x4_splat(0.0f);
251
+
252
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
253
+ const v128_t s8b = wasm_i8x16_splat(0x8);
254
+
255
+ for (; ib + 1 < nb; ib += 2) {
256
+ const block_q4_0 * GGML_RESTRICT x0 = &x[ib];
257
+ const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1];
258
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
259
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
260
+
261
+ // Load and process x0
262
+ v128_t v0_0 = wasm_v128_load(x0->qs);
263
+ v128_t v0_0l = wasm_v128_and(v0_0, m4b);
264
+ v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);
265
+ v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);
266
+ v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);
267
+
268
+ // Load y0 vectors
269
+ v128_t y0_l = wasm_v128_load(y0->qs);
270
+ v128_t y0_h = wasm_v128_load(y0->qs + 16);
271
+
272
+ // Extend to i16x8 and compute dot products
273
+ v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls);
274
+ v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls);
275
+ v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs);
276
+ v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs);
277
+
278
+ v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l);
279
+ v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l);
280
+ v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h);
281
+ v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h);
282
+
283
+ v128_t dp0 = wasm_i32x4_add(
284
+ wasm_i32x4_add(
285
+ wasm_i32x4_dot_i16x8(dx0l, dy0ll),
286
+ wasm_i32x4_dot_i16x8(dx0h, dy0lh)
287
+ ),
288
+ wasm_i32x4_add(
289
+ wasm_i32x4_dot_i16x8(dx0hl, dy0hl),
290
+ wasm_i32x4_dot_i16x8(dx0hh, dy0hh)
291
+ )
292
+ );
293
+
294
+ // Load and process x1
295
+ v128_t v0_1 = wasm_v128_load(x1->qs);
296
+ v128_t v0_1l = wasm_v128_and(v0_1, m4b);
297
+ v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);
298
+ v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);
299
+ v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);
300
+
301
+ // Load y1 vectors
302
+ v128_t y1_l = wasm_v128_load(y1->qs);
303
+ v128_t y1_h = wasm_v128_load(y1->qs + 16);
304
+
305
+ // Extend to i16x8 and compute dot products
306
+ v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls);
307
+ v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls);
308
+ v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs);
309
+ v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs);
310
+
311
+ v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l);
312
+ v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l);
313
+ v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h);
314
+ v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h);
315
+
316
+ v128_t dp1 = wasm_i32x4_add(
317
+ wasm_i32x4_add(
318
+ wasm_i32x4_dot_i16x8(dx1l, dy1ll),
319
+ wasm_i32x4_dot_i16x8(dx1h, dy1lh)
320
+ ),
321
+ wasm_i32x4_add(
322
+ wasm_i32x4_dot_i16x8(dx1hl, dy1hl),
323
+ wasm_i32x4_dot_i16x8(dx1hh, dy1hh)
324
+ )
325
+ );
326
+
327
+ // Accumulate results with scaling
328
+ float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
329
+ float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);
330
+
331
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
332
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
333
+ }
334
+
335
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
336
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
337
+
338
+ #endif
339
+ for (; ib < nb; ++ib) {
340
+ int sumi0 = 0;
341
+ int sumi1 = 0;
342
+
343
+ for (int j = 0; j < qk/2; ++j) {
344
+ const int v0 = (x[ib].qs[j] & 0x0F) - 8;
345
+ const int v1 = (x[ib].qs[j] >> 4) - 8;
346
+
347
+ sumi0 += (v0 * y[ib].qs[j]);
348
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
349
+ }
350
+
351
+ int sumi = sumi0 + sumi1;
352
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
353
+ }
354
+
355
+ *s = sumf;
356
+ }
357
+
358
// Dot product of n/QK8_0 Q5_0 blocks (vx) with the same number of Q8_0 blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK8_0 and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q5_0_q8_0_generic.
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
359
+ const int qk = QK8_0;
360
+ const int nb = n / qk; // number of blocks to process
361
+
362
+ int ib = 0;
363
+ float sumf = 0;
364
+
365
+ assert(n % qk == 0);
366
+ assert(qk == QK5_0);
367
+ assert(nrc == 1);
368
+ UNUSED(nrc);
369
+ UNUSED(bx);
370
+ UNUSED(by);
371
+ UNUSED(bs);
372
+
373
+ const block_q5_0 * GGML_RESTRICT x = vx;
374
+ const block_q8_0 * GGML_RESTRICT y = vy;
375
+
376
+ #if defined __wasm_simd128__
377
+ v128_t sumv = wasm_f32x4_splat(0.0f); // four float partial sums, reduced after the loop
378
+
379
+ uint32_t qh_; // scratch copy of the block's qh bytes
380
+ uint64_t tmp[4]; // four 8-byte table entries = 32 bytes, loaded below as two v128 values
381
+
382
+ // TODO: check if unrolling this is better
383
+ for (; ib < nb; ++ib) {
384
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
385
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
386
+
387
+ const v128_t m4b = wasm_i8x16_splat(0x0F); // low-nibble mask
388
+
389
+ // extract the 5th bit
390
+ memcpy(&qh_, x0->qh, sizeof(qh_));
391
+ // each byte of qh_ indexes table_b2b_1 (defined outside this chunk; presumably expands 8 bits to 8 bytes - TODO confirm)
392
+ tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF];
393
+ tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF];
394
+ tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF];
395
+ tmp[3] = table_b2b_1[(qh_ >> 24) ];
396
+
397
+ const v128_t qhl = wasm_v128_load(tmp + 0); // table bytes paired with the low nibbles
398
+ const v128_t qhh = wasm_v128_load(tmp + 2); // table bytes paired with the high nibbles
399
+
400
+ const v128_t v0 = wasm_v128_load(x0->qs); // 16 bytes holding two 4-bit values each
401
+
402
+ // 4-bit -> 8-bit
403
+ const v128_t v0l = wasm_v128_and (v0, m4b);
404
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
405
+
406
+ // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
407
+ const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
408
+ const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
409
+
410
+ // load y
411
+ const v128_t v1l = wasm_v128_load(y0->qs);
412
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
413
+
414
+ // int8x16 -> int16x8
415
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
416
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
417
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
418
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
419
+
420
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
421
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
422
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
423
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
424
+
425
+ // dot product: four i32x4 partial dots are added, converted to float and scaled by the product of both block scales
426
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
427
+ wasm_i32x4_add(
428
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
429
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
430
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
431
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
432
+ wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
433
+ }
434
+ // horizontal sum of the four float lanes
435
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
436
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
437
+
438
+ *s = sumf;
439
+ #else
440
+ UNUSED(nb); // the locals above are not referenced on this path
441
+ UNUSED(ib);
442
+ UNUSED(sumf);
443
+ UNUSED(x);
444
+ UNUSED(y);
445
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
446
+ #endif
447
+ }
448
+
449
// Dot product of n/QK8_1 Q5_1 blocks (vx) with the same number of Q8_1 blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK8_1 and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q5_1_q8_1_generic.
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
450
+ const int qk = QK8_1;
451
+ const int nb = n / qk; // number of blocks to process
452
+
453
+ int ib = 0;
454
+ float sumf = 0;
455
+
456
+ assert(n % qk == 0);
457
+ assert(qk == QK5_1);
458
+ assert(nrc == 1);
459
+ UNUSED(nrc);
460
+ UNUSED(bx);
461
+ UNUSED(by);
462
+ UNUSED(bs);
463
+
464
+ const block_q5_1 * GGML_RESTRICT x = vx;
465
+ const block_q8_1 * GGML_RESTRICT y = vy;
466
+
467
+ #if defined __wasm_simd128__
468
+ v128_t sumv = wasm_f32x4_splat(0.0f); // four float partial sums, reduced after the loop
469
+
470
+ float summs = 0.0f; // running sum of x0->m * y0->s over all blocks, added to the result at the end
471
+
472
+ uint32_t qh_; // scratch copy of the block's qh bytes
473
+ uint64_t tmp[4]; // four 8-byte table entries = 32 bytes, loaded below as two v128 values
474
+
475
+ // TODO: check if unrolling this is better
476
+ for (; ib < nb; ++ib) {
477
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
478
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
479
+
480
+ summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
481
+
482
+ const v128_t m4b = wasm_i8x16_splat(0x0F); // low-nibble mask
483
+
484
+ // extract the 5th bit
485
+ memcpy(&qh_, x0->qh, sizeof(qh_));
486
+ // each byte of qh_ indexes table_b2b_0 (defined outside this chunk; presumably expands 8 bits to 8 bytes - TODO confirm)
487
+ tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF];
488
+ tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF];
489
+ tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF];
490
+ tmp[3] = table_b2b_0[(qh_ >> 24) ];
491
+
492
+ const v128_t qhl = wasm_v128_load(tmp + 0); // table bytes paired with the low nibbles
493
+ const v128_t qhh = wasm_v128_load(tmp + 2); // table bytes paired with the high nibbles
494
+
495
+ const v128_t v0 = wasm_v128_load(x0->qs); // 16 bytes holding two 4-bit values each
496
+
497
+ // 4-bit -> 8-bit
498
+ const v128_t v0l = wasm_v128_and (v0, m4b);
499
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
500
+
501
+ // add high bit
502
+ const v128_t v0lf = wasm_v128_or(v0l, qhl);
503
+ const v128_t v0hf = wasm_v128_or(v0h, qhh);
504
+
505
+ // load y
506
+ const v128_t v1l = wasm_v128_load(y0->qs);
507
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
508
+
509
+ // int8x16 -> int16x8
510
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
511
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
512
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
513
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
514
+
515
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
516
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
517
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
518
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
519
+
520
+ // dot product: four i32x4 partial dots are added, converted to float and scaled by the product of both block scales
521
+ sumv = wasm_f32x4_add(sumv,
522
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
523
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
524
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
525
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
526
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
527
+ wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
528
+ }
529
+ // horizontal sum of the four float lanes plus the accumulated m*s term
530
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
531
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
532
+
533
+ *s = sumf;
534
+ #else
535
+ UNUSED(nb); // the locals above are not referenced on this path
536
+ UNUSED(ib);
537
+ UNUSED(sumf);
538
+ UNUSED(x);
539
+ UNUSED(y);
540
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
541
+ #endif
542
+ }
543
+
544
// Dot product of two arrays of n/QK8_0 Q8_0 blocks (vx and vy); the scalar result is stored in *s.
// n must be a multiple of QK8_0 and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q8_0_q8_0_generic.
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
545
+ const int qk = QK8_0;
546
+ const int nb = n / qk; // number of blocks to process
547
+
548
+ assert(n % qk == 0);
549
+ assert(nrc == 1);
550
+ UNUSED(nrc);
551
+ UNUSED(bx);
552
+ UNUSED(by);
553
+ UNUSED(bs);
554
+
555
+ const block_q8_0 * GGML_RESTRICT x = vx;
556
+ const block_q8_0 * GGML_RESTRICT y = vy;
557
+
558
+ int ib = 0;
559
+ float sumf = 0;
560
+
561
+ #if defined __wasm_simd128__
562
+ v128_t sumv = wasm_f32x4_splat(0.0f); // four float partial sums, reduced after the loop
563
+
564
+ for (; ib < nb; ++ib) {
565
+ const block_q8_0 * GGML_RESTRICT x0 = &x[ib];
566
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
567
+ // load 32 quantized bytes from each block as two 16-byte vectors
568
+ const v128_t x0_0 = wasm_v128_load(x0->qs);
569
+ const v128_t x0_1 = wasm_v128_load(x0->qs + 16);
570
+ const v128_t y0_0 = wasm_v128_load(y0->qs);
571
+ const v128_t y0_1 = wasm_v128_load(y0->qs + 16);
572
+
573
+ // Extend 8-bit to 16-bit
574
+ const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0);
575
+ const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0);
576
+ const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1);
577
+ const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1);
578
+
579
+ const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0);
580
+ const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0);
581
+ const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1);
582
+ const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1);
583
+
584
+ // Compute dot products
585
+ const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l);
586
+ const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h);
587
+ const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l);
588
+ const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h);
589
+
590
+ // Sum all dot products
591
+ const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));
592
+
593
+ // Convert to float and accumulate
594
+ const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); // product of both block scales
595
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
596
+ }
597
+ // horizontal sum of the four float lanes
598
+ sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
599
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
600
+
601
+ *s = sumf;
602
+ #else
603
+ UNUSED(nb); // the locals above are not referenced on this path
604
+ UNUSED(x);
605
+ UNUSED(y);
606
+ UNUSED(ib);
607
+ UNUSED(sumf);
608
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
609
+ #endif
610
+ }
611
+
612
// Dot product of n/QK_K Q2_K blocks (vx) with the same number of Q8_K blocks (vy); the scalar result is stored in *s.
// nrc must be 1 (asserted); bs, bx and by are not used by this implementation.
// NOTE(review): unlike the other *_K kernels in this file, n % QK_K == 0 is not asserted here.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q2_K_q8_K_generic.
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
613
+ assert(nrc == 1);
614
+ UNUSED(nrc);
615
+ UNUSED(bx);
616
+ UNUSED(by);
617
+ UNUSED(bs);
618
+
619
+ const block_q2_K * GGML_RESTRICT x = vx;
620
+ const block_q8_K * GGML_RESTRICT y = vy;
621
+
622
+ const int nb = n / QK_K; // number of super-blocks to process
623
+
624
+ #if defined __wasm_simd128__
625
+ float sumf = 0;
626
+
627
+ for (int i = 0; i < nb; ++i) {
628
+ const uint8_t * q2 = x[i].qs;
629
+ const int8_t * q8 = y[i].qs;
630
+ const uint8_t * sc = x[i].scales;
631
+
632
+ // Vectorized summs calculation
633
+ v128_t summs_vec = wasm_i32x4_splat(0);
634
+ {
635
+ v128_t sc_vec = wasm_v128_load(sc); // 16 scale bytes
636
+ v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); // upper nibble of each scale byte
637
+
638
+ v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper);
639
+ v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper);
640
+
641
+ v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]);
642
+ v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]);
643
+ // dot the 16 upper nibbles with the 16 bsums entries
644
+ summs_vec = wasm_i32x4_add(
645
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1),
646
+ wasm_i32x4_dot_i16x8(sc_high, bsums2)),
647
+ summs_vec
648
+ );
649
+ // two shuffle+add steps leave the total of all four lanes in every lane
650
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1));
651
+ summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2));
652
+ }
653
+ int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0);
654
+
655
+ // Vectorized isum calculation
656
+ int32_t isum = 0;
657
+ const uint8_t * sc_ptr = sc; // walks the scale bytes two at a time
658
+ const int k_iters = QK_K/128;
659
+
660
+ for (int k = 0; k < k_iters; ++k) {
661
+ v128_t isum_vec = wasm_i32x4_splat(0);
662
+ int shift = 0; // bit offset of the 2-bit field inside each q2 byte: 0, 2, 4, 6
663
+
664
+ for (int j = 0; j < 4; ++j) {
665
+ const int d0 = (sc_ptr[0] & 0xF); // low nibble of the scale byte, applied to the first 16 values
666
+ const int d1 = (sc_ptr[1] & 0xF); // low nibble of the next scale byte, applied to the next 16 values
667
+ sc_ptr += 2;
668
+
669
+ // Process first 16 elements
670
+ v128_t q2_0 = wasm_v128_load(q2);
671
+ v128_t q8_0 = wasm_v128_load(q8);
672
+ v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift);
673
+ v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03));
674
+
675
+ // Process next 16 elements
676
+ v128_t q2_1 = wasm_v128_load(q2 + 16);
677
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
678
+ v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift);
679
+ v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03));
680
+
681
+ // Calculate dot products
682
+ v128_t p0 = wasm_i32x4_dot_i16x8(
683
+ wasm_i16x8_extend_low_i8x16(q8_0),
684
+ wasm_i16x8_extend_low_i8x16(q2_bits_0)
685
+ );
686
+ v128_t p1 = wasm_i32x4_dot_i16x8(
687
+ wasm_i16x8_extend_high_i8x16(q8_0),
688
+ wasm_i16x8_extend_high_i8x16(q2_bits_0)
689
+ );
690
+ v128_t p2 = wasm_i32x4_dot_i16x8(
691
+ wasm_i16x8_extend_low_i8x16(q8_1),
692
+ wasm_i16x8_extend_low_i8x16(q2_bits_1)
693
+ );
694
+ v128_t p3 = wasm_i32x4_dot_i16x8(
695
+ wasm_i16x8_extend_high_i8x16(q8_1),
696
+ wasm_i16x8_extend_high_i8x16(q2_bits_1)
697
+ );
698
+
699
+ // Accumulate scaled results
700
+ v128_t scaled = wasm_i32x4_add(
701
+ wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)),
702
+ wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1))
703
+ );
704
+
705
+ isum_vec = wasm_i32x4_add(isum_vec, scaled);
706
+ q8 += 32; // q8 advances every inner iteration; the same 32 q2 bytes are reused with the next shift
707
+ shift += 2;
708
+ }
709
+ q2 += 32;
710
+
711
+ // Horizontal sum of isum_vec
712
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1));
713
+ isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2));
714
+ isum += wasm_i32x4_extract_lane(isum_vec, 0);
715
+ }
716
+ // combine: scaled quant sum minus the scaled min term for this super-block
717
+ const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
718
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
719
+ sumf += dall * isum - dmin * summs;
720
+ }
721
+
722
+ *s = sumf;
723
+
724
+ #else
725
+ UNUSED(x); // the locals above are not referenced on this path
726
+ UNUSED(y);
727
+ UNUSED(nb);
728
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
729
+ #endif
730
+ }
731
+
732
// Dot product of n/QK_K Q3_K blocks (vx) with the same number of Q8_K blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q3_K_q8_K_generic.
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
733
+ assert(n % QK_K == 0);
734
+ assert(nrc == 1);
735
+ UNUSED(nrc);
736
+ UNUSED(bx);
737
+ UNUSED(by);
738
+ UNUSED(bs);
739
+ // byte-wise masks used when rearranging the packed scales below
740
+ const uint32_t kmask1 = 0x03030303;
741
+ const uint32_t kmask2 = 0x0f0f0f0f;
742
+
743
+ const block_q3_K * GGML_RESTRICT x = vx;
744
+ const block_q8_K * GGML_RESTRICT y = vy;
745
+
746
+ const int nb = n / QK_K; // number of super-blocks to process
747
+
748
+ #if defined __wasm_simd128__
749
+ int8_t aux8[QK_K]; // unpacked signed quants of the current super-block
750
+ float sums[8] = {0}; // only sums[0..3] are written below; sums[4..7] stay zero
751
+ uint32_t auxs[4]; // 16 bytes of rearranged scales
752
+
753
+ float sumf = 0;
754
+ for (int i = 0; i < nb; ++i) {
755
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
756
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
757
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
758
+
759
+ // Process blocks with SIMD
760
+ int8_t * a = aux8;
761
+ uint8_t m = 1; // hmask bit selector; moves up one bit per 32 outputs and is not reset between j iterations
762
+ for (int j = 0; j < QK_K; j += 128) {
763
+ for (int shift = 0; shift <= 6; shift += 2) {
764
+ v128_t v_m = wasm_i8x16_splat(m);
765
+ for (int l = 0; l < 32; l += 16) {
766
+ v128_t v_q3 = wasm_v128_load(q3 + l);
767
+ v128_t v_shift = wasm_i8x16_shr(v_q3, shift); // signed shift; the & 0x03 below keeps only the two wanted bits
768
+ v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03));
769
+
770
+ v128_t v_hm = wasm_v128_load(hm + l);
771
+ v128_t v_mask = wasm_v128_and(v_hm, v_m);
772
+ v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); // all-ones in lanes where the selected hmask bit is set
773
+ // subtract 4 in lanes where the selected hmask bit is clear
774
+ v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask)));
775
+ wasm_v128_store(a + l, v_low2);
776
+ }
777
+ a += 32;
778
+ m <<= 1;
779
+ }
780
+ q3 += 32;
781
+ }
782
+
783
+ // Extract scales
784
+ memcpy(auxs, x[i].scales, 12); // 12 packed bytes are expanded into the 16 bytes of auxs
785
+ uint32_t tmp = auxs[2];
786
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
787
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
788
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
789
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
790
+ const int8_t * scales = (const int8_t *)auxs; // byte view of the rearranged scales
791
+
792
+ // SIMD dot product with register accumulators
793
+ v128_t v_acc0 = wasm_i32x4_splat(0);
794
+ v128_t v_acc1 = wasm_i32x4_splat(0);
795
+ a = aux8;
796
+ for (int j = 0; j < QK_K/16; ++j) {
797
+ const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); // one scale per 16 values, offset by 32
798
+
799
+ // Process 16 elements per iteration
800
+ for (int k = 0; k < 2; ++k) {
801
+ const v128_t v_q8 = wasm_i16x8_load8x8(q8); // loads 8 int8 values widened to int16
802
+ const v128_t v_a = wasm_i16x8_load8x8(a);
803
+ // both multiplies are done in 16-bit lanes
804
+ v128_t v_prod = wasm_i16x8_mul(v_q8, v_a);
805
+ v_prod = wasm_i16x8_mul(v_prod, v_scale);
806
+ // widen the 16-bit products to 32 bits before accumulating
807
+ v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod));
808
+ v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod));
809
+
810
+ q8 += 8;
811
+ a += 8;
812
+ }
813
+ }
814
+
815
+ // Accumulate results
816
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
817
+ const v128_t v_d = wasm_f32x4_splat(d);
818
+ v128_t v_sum = wasm_f32x4_add(
819
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
820
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d)
821
+ );
822
+
823
+ // Accumulate into sums vector
824
+ wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum));
825
+ }
826
+
827
+ // Horizontal sum
828
+ v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4));
829
+ sumf = wasm_f32x4_extract_lane(v_sum, 0) +
830
+ wasm_f32x4_extract_lane(v_sum, 1) +
831
+ wasm_f32x4_extract_lane(v_sum, 2) +
832
+ wasm_f32x4_extract_lane(v_sum, 3);
833
+
834
+ *s = sumf;
835
+
836
+ #else
837
+ UNUSED(kmask1); // the locals above are not referenced on this path
838
+ UNUSED(kmask2);
839
+ UNUSED(x);
840
+ UNUSED(y);
841
+ UNUSED(nb);
842
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
843
+ #endif
844
+
845
+ }
846
+
847
// Dot product of n/QK_K Q4_K blocks (vx) with the same number of Q8_K blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q4_K_q8_K_generic.
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
848
+ assert(n % QK_K == 0);
849
+ assert(nrc == 1);
850
+ UNUSED(nrc);
851
+ UNUSED(bx);
852
+ UNUSED(by);
853
+ UNUSED(bs);
854
+
855
+ const block_q4_K * GGML_RESTRICT x = vx;
856
+ const block_q8_K * GGML_RESTRICT y = vy;
857
+
858
+ const int nb = n / QK_K; // number of super-blocks to process
859
+ // byte-wise masks used when rearranging the packed scales and mins below
860
+ static const uint32_t kmask1 = 0x3f3f3f3f;
861
+ static const uint32_t kmask2 = 0x0f0f0f0f;
862
+ static const uint32_t kmask3 = 0x03030303;
863
+
864
+ uint32_t utmp[4]; // 16 bytes: rearranged scales and mins of the current super-block
865
+
866
+ #if defined __wasm_simd128__
867
+ const uint8_t * scales = (const uint8_t*)&utmp[0]; // byte view of utmp; valid after utmp is filled inside the loop
868
+ float sumf = 0;
869
+
870
+ for (int i = 0; i < nb; ++i) {
871
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
872
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // not negated here; the min term is subtracted from sumf below
873
+
874
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
875
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
876
+
877
+ // Process scales and mins
878
+ memcpy(utmp, x[i].scales, 12); // 12 packed bytes are expanded into the 16 bytes of utmp
879
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
880
+ const uint32_t uaux = utmp[1] & kmask1;
881
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
882
+ utmp[2] = uaux;
883
+ utmp[0] &= kmask1;
884
+
885
+ // Sum mins * q8sums
886
+ int32_t sumi = 0;
887
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
888
+ const uint8_t * m = (const uint8_t *)&utmp[2]; // mins are read from the 8 bytes starting at utmp[2]
889
+ for (int j = 0; j < 16; j += 2) {
890
+ sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; // each min byte weights a pair of adjacent bsums entries
891
+ }
892
+ sumf -= dmin * sumi;
893
+
894
+ int32_t sumi1 = 0; // scaled sum of the low-nibble products
895
+ int32_t sumi2 = 0; // scaled sum of the high-nibble products
896
+
897
+ for (int j = 0; j < QK_K/64; ++j) {
898
+ // Load 64 4-bit weights (32 bytes)
899
+ const v128_t q4x0 = wasm_v128_load(q4);
900
+ const v128_t q4x1 = wasm_v128_load(q4 + 16);
901
+ q4 += 32;
902
+
903
+ // Split into low/high nibbles
904
+ const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F));
905
+ const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4);
906
+ const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F));
907
+ const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4);
908
+
909
+ // Load 64 8-bit values (64 bytes)
910
+ const v128_t q8x0 = wasm_v128_load(q8);
911
+ const v128_t q8x1 = wasm_v128_load(q8 + 16);
912
+ const v128_t q8x2 = wasm_v128_load(q8 + 32);
913
+ const v128_t q8x3 = wasm_v128_load(q8 + 48);
914
+ q8 += 64;
915
+
916
+ // Low nibble products
917
+ v128_t vacc1 = wasm_i32x4_dot_i16x8(
918
+ wasm_i16x8_extend_low_i8x16(q4l0),
919
+ wasm_i16x8_extend_low_i8x16(q8x0)
920
+ );
921
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
922
+ wasm_i16x8_extend_high_i8x16(q4l0),
923
+ wasm_i16x8_extend_high_i8x16(q8x0)
924
+ ));
925
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
926
+ wasm_i16x8_extend_low_i8x16(q4l1),
927
+ wasm_i16x8_extend_low_i8x16(q8x1)
928
+ ));
929
+ vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
930
+ wasm_i16x8_extend_high_i8x16(q4l1),
931
+ wasm_i16x8_extend_high_i8x16(q8x1)
932
+ ));
933
+
934
+ // High nibble products
935
+ v128_t vacc2 = wasm_i32x4_dot_i16x8(
936
+ wasm_i16x8_extend_low_i8x16(q4h0),
937
+ wasm_i16x8_extend_low_i8x16(q8x2)
938
+ );
939
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
940
+ wasm_i16x8_extend_high_i8x16(q4h0),
941
+ wasm_i16x8_extend_high_i8x16(q8x2)
942
+ ));
943
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
944
+ wasm_i16x8_extend_low_i8x16(q4h1),
945
+ wasm_i16x8_extend_low_i8x16(q8x3)
946
+ ));
947
+ vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
948
+ wasm_i16x8_extend_high_i8x16(q4h1),
949
+ wasm_i16x8_extend_high_i8x16(q8x3)
950
+ ));
951
+
952
+ // Accumulate scaled results
953
+ int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) +
954
+ wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3);
955
+ sumi1 += vacc1_sum * scales[2*j]; // even scale byte weights the low-nibble sum
956
+
957
+ int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) +
958
+ wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3);
959
+ sumi2 += vacc2_sum * scales[2*j+1]; // odd scale byte weights the high-nibble sum
960
+ }
961
+
962
+ sumf += d * (sumi1 + sumi2);
963
+ }
964
+
965
+ *s = sumf;
966
+
967
+ #else
968
+ UNUSED(x); // the locals above are not referenced on this path
969
+ UNUSED(y);
970
+ UNUSED(nb);
971
+ UNUSED(kmask1);
972
+ UNUSED(kmask2);
973
+ UNUSED(kmask3);
974
+ UNUSED(utmp);
975
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
976
+ #endif
977
+ }
978
+
979
// Dot product of n/QK_K Q5_K blocks (vx) with the same number of Q8_K blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q5_K_q8_K_generic.
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
980
+ assert(n % QK_K == 0);
981
+ assert(nrc == 1);
982
+ UNUSED(nrc);
983
+ UNUSED(bx);
984
+ UNUSED(by);
985
+ UNUSED(bs);
986
+
987
+ const block_q5_K * GGML_RESTRICT x = vx;
988
+ const block_q8_K * GGML_RESTRICT y = vy;
989
+
990
+ const int nb = n / QK_K; // number of super-blocks to process
991
+ // byte-wise masks used when rearranging the packed scales and mins below
992
+ static const uint32_t kmask1 = 0x3f3f3f3f;
993
+ static const uint32_t kmask2 = 0x0f0f0f0f;
994
+ static const uint32_t kmask3 = 0x03030303;
995
+
996
+ uint32_t utmp[4]; // 16 bytes: rearranged scales and mins of the current super-block
997
+
998
+ #if defined __wasm_simd128__
999
+ //const uint8_t * scales = (const uint8_t*)&utmp[0];
1000
+ float sumf = 0;
1001
+
1002
+ for (int i = 0; i < nb; ++i) {
1003
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1004
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // not negated here; the min term is subtracted from sumf below
1005
+
1006
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1007
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1008
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1009
+
1010
+ // Process scales and mins
1011
+ memcpy(utmp, x[i].scales, 12); // 12 packed bytes are expanded into the 16 bytes of utmp
1012
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1013
+ const uint32_t uaux = utmp[1] & kmask1;
1014
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1015
+ utmp[2] = uaux;
1016
+ utmp[0] &= kmask1;
1017
+
1018
+ // Sum mins * q8sums
1019
+ int32_t sumi_mins = 0;
1020
+ const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
1021
+ const uint8_t * m = (const uint8_t *)&utmp[2]; // mins are read from the 8 bytes starting at utmp[2]
1022
+ for (int j = 0; j < 16; j += 2) {
1023
+ sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; // each min byte weights a pair of adjacent bsums entries
1024
+ }
1025
+ sumf -= dmin * sumi_mins; // subtract the scaled min term for this super-block
1026
+ // the 32 qh bytes are loaded once per super-block; each j iteration below uses two of their bits
1027
+ v128_t qh0 = wasm_v128_load(qh);
1028
+ v128_t qh1 = wasm_v128_load(qh + 16);
1029
+ const uint8_t * sc = (const uint8_t *)utmp; // scales are read from the bytes starting at utmp[0]
1030
+
1031
+ int32_t sumi = 0;
1032
+
1033
+ for (int j = 0; j < QK_K/64; ++j) {
1034
+ const int shift = j * 2; // bit offset inside each qh byte for this group of 64 values
1035
+ v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
1036
+ v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
1037
+ // bit 0 becomes bit 4 of the low-nibble values, bit 1 becomes bit 4 of the high-nibble values
1038
+ v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
1039
+ v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
1040
+ v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
1041
+ v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
1042
+
1043
+ v128_t q5_0 = wasm_v128_load(q5);
1044
+ v128_t q5_1 = wasm_v128_load(q5 + 16);
1045
+ q5 += 32;
1046
+ // combine each 4-bit nibble with its fifth bit
1047
+ v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
1048
+ v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
1049
+ v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
1050
+ v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
1051
+
1052
+ v128_t q8_0 = wasm_v128_load(q8);
1053
+ v128_t q8_1 = wasm_v128_load(q8 + 16);
1054
+ v128_t q8_2 = wasm_v128_load(q8 + 32);
1055
+ v128_t q8_3 = wasm_v128_load(q8 + 48);
1056
+ q8 += 64;
1057
+
1058
+ // Process low quants
1059
+ v128_t pl0 = wasm_i32x4_dot_i16x8(
1060
+ wasm_i16x8_extend_low_i8x16(q5l_0),
1061
+ wasm_i16x8_extend_low_i8x16(q8_0)
1062
+ );
1063
+ pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
1064
+ wasm_i16x8_extend_high_i8x16(q5l_0),
1065
+ wasm_i16x8_extend_high_i8x16(q8_0)
1066
+ ));
1067
+ v128_t pl1 = wasm_i32x4_dot_i16x8(
1068
+ wasm_i16x8_extend_low_i8x16(q5l_1),
1069
+ wasm_i16x8_extend_low_i8x16(q8_1)
1070
+ );
1071
+ pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
1072
+ wasm_i16x8_extend_high_i8x16(q5l_1),
1073
+ wasm_i16x8_extend_high_i8x16(q8_1)
1074
+ ));
1075
+ v128_t sum_low = wasm_i32x4_add(pl0, pl1);
1076
+
1077
+ // Process high quants
1078
+ v128_t ph0 = wasm_i32x4_dot_i16x8(
1079
+ wasm_i16x8_extend_low_i8x16(q5h_0),
1080
+ wasm_i16x8_extend_low_i8x16(q8_2)
1081
+ );
1082
+ ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
1083
+ wasm_i16x8_extend_high_i8x16(q5h_0),
1084
+ wasm_i16x8_extend_high_i8x16(q8_2)
1085
+ ));
1086
+ v128_t ph1 = wasm_i32x4_dot_i16x8(
1087
+ wasm_i16x8_extend_low_i8x16(q5h_1),
1088
+ wasm_i16x8_extend_low_i8x16(q8_3)
1089
+ );
1090
+ ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
1091
+ wasm_i16x8_extend_high_i8x16(q5h_1),
1092
+ wasm_i16x8_extend_high_i8x16(q8_3)
1093
+ ));
1094
+ v128_t sum_high = wasm_i32x4_add(ph0, ph1);
1095
+
1096
+ // Accumulate with scale factors
1097
+ int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
1098
+ wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
1099
+ int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
1100
+ wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
1101
+ // even scale byte weights the low-nibble sum, odd scale byte the high-nibble sum
1102
+ sumi += sl * sc[2*j] + sh * sc[2*j+1];
1103
+ }
1104
+
1105
+ sumf += d * sumi;
1106
+ }
1107
+
1108
+ *s = sumf;
1109
+
1110
+ #else
1111
+ UNUSED(x); // the locals above are not referenced on this path
1112
+ UNUSED(y);
1113
+ UNUSED(nb);
1114
+ UNUSED(kmask1);
1115
+ UNUSED(kmask2);
1116
+ UNUSED(kmask3);
1117
+ UNUSED(utmp);
1118
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1119
+ #endif
1120
+ }
1121
+
1122
// Dot product of n/QK_K Q6_K blocks (vx) with the same number of Q8_K blocks (vy); the scalar result is stored in *s.
// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used by this implementation.
// With __wasm_simd128__ defined a WASM SIMD kernel runs; otherwise the call is forwarded to ggml_vec_dot_q6_K_q8_K_generic.
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1123
+ assert(n % QK_K == 0);
1124
+ assert(nrc == 1);
1125
+ UNUSED(nrc);
1126
+ UNUSED(bx);
1127
+ UNUSED(by);
1128
+ UNUSED(bs);
1129
+
1130
+ const block_q6_K * GGML_RESTRICT x = vx;
1131
+ const block_q8_K * GGML_RESTRICT y = vy;
1132
+
1133
+ const int nb = n / QK_K; // number of super-blocks to process
1134
+
1135
+ #if defined __wasm_simd128__
1136
+ int8_t aux8[QK_K] __attribute__((aligned(16))); // unpacked signed quants of the current super-block
1137
+ int32_t aux32[8] __attribute__((aligned(16))) = {0}; // receives the two i32x4 accumulators of each super-block
1138
+ float sums[8] __attribute__((aligned(16))) = {0}; // eight float partial sums carried across super-blocks
1139
+
1140
+ for (int i = 0; i < nb; ++i) {
1141
+ // Unpack into aux8 with scalar code: 4 bits from ql combined with 2 bits from qh, then minus 32
1142
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1143
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1144
+ int8_t * a = aux8;
1145
+ for (int j = 0; j < QK_K; j += 128) {
1146
+ for (int l = 0; l < 32; ++l) {
1147
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1148
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1149
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1150
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1151
+ }
1152
+ a += 128; // 128 outputs are produced from 64 ql bytes and 32 qh bytes
1153
+ q4 += 64;
1154
+ qh += 32;
1155
+ }
1156
+
1157
+ const int8_t * GGML_RESTRICT a_ptr = aux8;
1158
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1159
+ v128_t acc0 = wasm_i32x4_splat(0);
1160
+ v128_t acc1 = wasm_i32x4_splat(0);
1161
+
1162
+ for (int j = 0; j < QK_K/16; ++j) {
1163
+ const int scale = x[i].scales[j]; // one scale per group of 16 values
1164
+ const v128_t vscale = wasm_i32x4_splat(scale);
1165
+
1166
+ // Load 16 elements from a and q8
1167
+ const v128_t a_vec = wasm_v128_load(a_ptr);
1168
+ const v128_t q8_vec = wasm_v128_load(q8);
1169
+
1170
+ // Process low 8 elements
1171
+ v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec);
1172
+ v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec);
1173
+ v128_t prod_low = wasm_i16x8_mul(a_low, q8_low);
1174
+ v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low);
1175
+ v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low);
1176
+
1177
+ // Process high 8 elements
1178
+ v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec);
1179
+ v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec);
1180
+ v128_t prod_high = wasm_i16x8_mul(a_high, q8_high);
1181
+ v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high);
1182
+ v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high);
1183
+
1184
+ // Scale and accumulate
1185
+ prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale);
1186
+ prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale);
1187
+ prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale);
1188
+ prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale);
1189
+
1190
+ acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo));
1191
+ acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi));
1192
+
1193
+ a_ptr += 16;
1194
+ q8 += 16;
1195
+ }
1196
+
1197
+ // Store accumulated results
1198
+ wasm_v128_store(&aux32[0], acc0);
1199
+ wasm_v128_store(&aux32[4], acc1);
1200
+ // scale the eight integer sums by the combined block scale and add them to the running floats
1201
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1202
+ for (int l = 0; l < 8; ++l) {
1203
+ sums[l] += d * aux32[l];
1204
+ }
1205
+ }
1206
+
1207
+ // Sum final results
1208
+ float sumf = 0;
1209
+ for (int l = 0; l < 8; ++l) {
1210
+ sumf += sums[l];
1211
+ }
1212
+ *s = sumf;
1213
+
1214
+ #else
1215
+ UNUSED(x); // the locals above are not referenced on this path
1216
+ UNUSED(y);
1217
+ UNUSED(nb);
1218
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1219
+ #endif
1220
+ }
1221
+