whispercpp 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -201,24 +201,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
201
201
 
202
202
  sumf = vec_extract(vsumf0, 0);
203
203
 
204
- #endif
205
- for (; ib < nb; ++ib) {
206
- int sumi0 = 0;
207
- int sumi1 = 0;
208
-
209
- for (int j = 0; j < qk/2; ++j) {
210
- const int v0 = (x[ib].qs[j] & 0x0F) - 8;
211
- const int v1 = (x[ib].qs[j] >> 4) - 8;
212
-
213
- sumi0 += (v0 * y[ib].qs[j]);
214
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
215
- }
216
-
217
- int sumi = sumi0 + sumi1;
218
- sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
219
- }
220
-
221
204
  *s = sumf;
205
+ #else
206
+ UNUSED(x);
207
+ UNUSED(y);
208
+ UNUSED(ib);
209
+ UNUSED(sumf);
210
+ ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
211
+ #endif
222
212
  }
223
213
 
224
214
  void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -278,24 +268,80 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
278
268
 
279
269
  sumf = vec_extract(vsumf0, 0);
280
270
 
271
+ *s = sumf;
272
+ #else
273
+ UNUSED(x);
274
+ UNUSED(y);
275
+ UNUSED(ib);
276
+ UNUSED(sumf);
277
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
281
278
  #endif
279
+ }
280
+
281
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
282
+ assert(nrc == 1);
283
+ UNUSED(nrc);
284
+ UNUSED(bx);
285
+ UNUSED(by);
286
+ UNUSED(bs);
287
+ assert(n % QK_MXFP4 == 0);
288
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
289
+
290
+ const block_mxfp4 * GGML_RESTRICT x = vx;
291
+ const block_q8_0 * GGML_RESTRICT y = vy;
292
+
293
+ const int nb = n / QK_MXFP4;
294
+
295
+ int ib = 0;
296
+ float sumf = 0;
297
+
298
+ #if defined(__POWER9_VECTOR__)
299
+ const vector signed char lowMask = vec_splats((signed char)0xF);
300
+ const vector unsigned char vshift4 = vec_splats((unsigned char)4);
301
+ vector float vsumf0 = vec_splats(0.0f);
302
+
303
+ vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
304
+
305
+ #pragma GCC unroll 8
282
306
  for (; ib < nb; ++ib) {
283
- int sumi0 = 0;
284
- int sumi1 = 0;
307
+ __builtin_prefetch(x[ib].qs, 0, 1);
308
+ __builtin_prefetch(y[ib].qs, 0, 1);
285
309
 
286
- for (int j = 0; j < qk/2; ++j) {
287
- const int v0 = (x[ib].qs[j] & 0x0F);
288
- const int v1 = (x[ib].qs[j] >> 4);
310
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
311
+ GGML_E8M0_TO_FP32_HALF(x[ib].e));
289
312
 
290
- sumi0 += (v0 * y[ib].qs[j]);
291
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
292
- }
313
+ vector signed char q8y0 = vec_xl( 0, y[ib].qs);
314
+ vector signed char q8y1 = vec_xl(16, y[ib].qs);
315
+
316
+ vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
317
+
318
+ vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
319
+ vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
320
+
321
+ vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
322
+ vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
293
323
 
294
- int sumi = sumi0 + sumi1;
295
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
324
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
325
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
326
+
327
+ vector signed int vsumi0 = vec_splats((int32_t)0);
328
+ vsumi0 = vec_sum4s(qv0, vsumi0);
329
+ vsumi0 = vec_sum4s(qv1, vsumi0);
330
+
331
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
296
332
  }
297
333
 
334
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
335
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
336
+ sumf = vec_extract(vsumf0, 0);
298
337
  *s = sumf;
338
+ #else
339
+ UNUSED(x);
340
+ UNUSED(y);
341
+ UNUSED(ib);
342
+ UNUSED(sumf);
343
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
344
+ #endif
299
345
  }
300
346
 
301
347
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -360,30 +406,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
360
406
 
361
407
  sumf = vec_extract(vsumf0, 0);
362
408
 
363
- #endif
364
- for (; ib < nb; ++ib) {
365
- uint32_t qh;
366
- memcpy(&qh, x[ib].qh, sizeof(qh));
367
-
368
- int sumi0 = 0;
369
- int sumi1 = 0;
370
-
371
- for (int j = 0; j < qk/2; ++j) {
372
- const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
373
- const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
374
-
375
- const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
376
- const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
377
-
378
- sumi0 += (x0 * y[ib].qs[j]);
379
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
380
- }
381
-
382
- int sumi = sumi0 + sumi1;
383
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
384
- }
385
-
386
409
  *s = sumf;
410
+ #else
411
+ UNUSED(ib);
412
+ UNUSED(sumf);
413
+ UNUSED(x);
414
+ UNUSED(y);
415
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
416
+ #endif
387
417
  }
388
418
 
389
419
  void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -451,30 +481,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
451
481
 
452
482
  sumf = vec_extract(vsumf0, 0);
453
483
 
454
- #endif
455
- for (; ib < nb; ++ib) {
456
- uint32_t qh;
457
- memcpy(&qh, x[ib].qh, sizeof(qh));
458
-
459
- int sumi0 = 0;
460
- int sumi1 = 0;
461
-
462
- for (int j = 0; j < qk/2; ++j) {
463
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
464
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
465
-
466
- const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
467
- const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
468
-
469
- sumi0 += (x0 * y[ib].qs[j]);
470
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
471
- }
472
-
473
- int sumi = sumi0 + sumi1;
474
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
475
- }
476
-
477
484
  *s = sumf;
485
+ #else
486
+ UNUSED(nb);
487
+ UNUSED(ib);
488
+ UNUSED(sumf);
489
+ UNUSED(x);
490
+ UNUSED(y);
491
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
492
+ #endif
478
493
  }
479
494
 
480
495
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -535,18 +550,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
535
550
 
536
551
  sumf = vec_extract(vsumf0, 0);
537
552
 
538
- #endif
539
- for (; ib < nb; ++ib) {
540
- int sumi = 0;
541
-
542
- for (int j = 0; j < qk; j++) {
543
- sumi += x[ib].qs[j]*y[ib].qs[j];
544
- }
545
-
546
- sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
547
- }
548
-
549
553
  *s = sumf;
554
+ #else
555
+ UNUSED(nb);
556
+ UNUSED(x);
557
+ UNUSED(y);
558
+ UNUSED(ib);
559
+ UNUSED(sumf);
560
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
561
+ #endif
550
562
  }
551
563
 
552
564
  void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -695,45 +707,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
695
707
  *s = vec_extract(vsumf0, 0);
696
708
 
697
709
  #else
698
-
699
- float sumf = 0;
700
-
701
- for (int i = 0; i < nb; ++i) {
702
-
703
- const uint8_t * q2 = x[i].qs;
704
- const int8_t * q8 = y[i].qs;
705
- const uint8_t * sc = x[i].scales;
706
-
707
- int summs = 0;
708
- for (int j = 0; j < 16; ++j) {
709
- summs += y[i].bsums[j] * (sc[j] >> 4);
710
- }
711
-
712
- const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
713
- const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
714
-
715
- int isum = 0;
716
- int is = 0;
717
- int d;
718
- for (int k = 0; k < QK_K/128; ++k) {
719
- int shift = 0;
720
- for (int j = 0; j < 4; ++j) {
721
- d = sc[is++] & 0xF;
722
- int isuml = 0;
723
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
724
- isum += d * isuml;
725
- d = sc[is++] & 0xF;
726
- isuml = 0;
727
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
728
- isum += d * isuml;
729
- shift += 2;
730
- q8 += 32;
731
- }
732
- q2 += 32;
733
- }
734
- sumf += dall * isum - dmin * summs;
735
- }
736
- *s = sumf;
710
+ UNUSED(x);
711
+ UNUSED(y);
712
+ UNUSED(nb);
713
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
737
714
  #endif
738
715
  }
739
716
 
@@ -907,70 +884,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
907
884
  *s = vec_extract(vsumf0, 0);
908
885
 
909
886
  #else
910
- // scalar version
911
- // This function is written like this so the compiler can manage to vectorize most of it
912
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
913
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
914
- // The ideal situation would be if we could just write the code once, and the compiler would
915
- // automatically produce the best possible set of machine instructions, instead of us having to manually
916
- // write vectorized versions for AVX, ARM_NEON, etc.
917
-
918
- int8_t aux8[QK_K];
919
- int16_t aux16[8];
920
- float sums [8];
921
- int32_t aux32[8];
922
- memset(sums, 0, 8*sizeof(float));
923
-
924
- uint32_t auxs[4];
925
- const int8_t * scales = (const int8_t*)auxs;
926
-
927
- float sumf = 0;
928
- for (int i = 0; i < nb; ++i) {
929
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
930
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
931
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
932
- memset(aux32, 0, 8*sizeof(int32_t));
933
- int8_t * GGML_RESTRICT a = aux8;
934
- uint8_t m = 1;
935
- for (int j = 0; j < QK_K; j += 128) {
936
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
937
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
938
- a += 32; m <<= 1;
939
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
940
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
941
- a += 32; m <<= 1;
942
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
943
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
944
- a += 32; m <<= 1;
945
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
946
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
947
- a += 32; m <<= 1;
948
- q3 += 32;
949
- }
950
- a = aux8;
951
-
952
- memcpy(auxs, x[i].scales, 12);
953
- uint32_t tmp = auxs[2];
954
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
955
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
956
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
957
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
958
- for (int j = 0; j < QK_K/16; ++j) {
959
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
960
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
961
- q8 += 8; a += 8;
962
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
963
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
964
- q8 += 8; a += 8;
965
- }
966
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
967
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
968
- }
969
- for (int l = 0; l < 8; ++l) sumf += sums[l];
970
- *s = sumf;
971
-
887
+ UNUSED(kmask1);
888
+ UNUSED(kmask2);
889
+ UNUSED(x);
890
+ UNUSED(y);
891
+ UNUSED(nb);
892
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
972
893
  #endif
973
-
974
894
  }
975
895
 
976
896
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1130,61 +1050,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1130
1050
  *s = vec_extract(vsumf0, 0);
1131
1051
 
1132
1052
  #else
1133
-
1134
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1135
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1136
-
1137
- int8_t aux8[QK_K];
1138
- int16_t aux16[8];
1139
- float sums [8];
1140
- int32_t aux32[8];
1141
- memset(sums, 0, 8*sizeof(float));
1142
-
1143
- float sumf = 0;
1144
- for (int i = 0; i < nb; ++i) {
1145
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1146
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1147
- memset(aux32, 0, 8*sizeof(int32_t));
1148
- int8_t * GGML_RESTRICT a = aux8;
1149
- for (int j = 0; j < QK_K/64; ++j) {
1150
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1151
- a += 32;
1152
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1153
- a += 32; q4 += 32;
1154
- }
1155
- memcpy(utmp, x[i].scales, 12);
1156
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1157
- const uint32_t uaux = utmp[1] & kmask1;
1158
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1159
- utmp[2] = uaux;
1160
- utmp[0] &= kmask1;
1161
-
1162
- int sumi = 0;
1163
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1164
- a = aux8;
1165
- int is = 0;
1166
- for (int j = 0; j < QK_K/32; ++j) {
1167
- int32_t scale = scales[is++];
1168
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1169
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1170
- q8 += 8; a += 8;
1171
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1172
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1173
- q8 += 8; a += 8;
1174
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1175
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1176
- q8 += 8; a += 8;
1177
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1178
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1179
- q8 += 8; a += 8;
1180
- }
1181
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1182
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1183
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1184
- sumf -= dmin * sumi;
1185
- }
1186
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1187
- *s = sumf;
1053
+ UNUSED(x);
1054
+ UNUSED(y);
1055
+ UNUSED(nb);
1056
+ UNUSED(kmask1);
1057
+ UNUSED(kmask2);
1058
+ UNUSED(kmask3);
1059
+ UNUSED(utmp);
1060
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1188
1061
  #endif
1189
1062
  }
1190
1063
 
@@ -1342,66 +1215,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1342
1215
  *s = vec_extract(vsumf0, 0);
1343
1216
 
1344
1217
  #else
1345
-
1346
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1347
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1348
-
1349
- int8_t aux8[QK_K];
1350
- int16_t aux16[8];
1351
- float sums [8];
1352
- int32_t aux32[8];
1353
- memset(sums, 0, 8*sizeof(float));
1354
-
1355
- float sumf = 0;
1356
- for (int i = 0; i < nb; ++i) {
1357
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1358
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
1359
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1360
- memset(aux32, 0, 8*sizeof(int32_t));
1361
- int8_t * GGML_RESTRICT a = aux8;
1362
- uint8_t m = 1;
1363
- for (int j = 0; j < QK_K/64; ++j) {
1364
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1365
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1366
- a += 32; m <<= 1;
1367
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1368
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1369
- a += 32; m <<= 1;
1370
- q4 += 32;
1371
- }
1372
- memcpy(utmp, x[i].scales, 12);
1373
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1374
- const uint32_t uaux = utmp[1] & kmask1;
1375
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1376
- utmp[2] = uaux;
1377
- utmp[0] &= kmask1;
1378
-
1379
- int sumi = 0;
1380
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1381
- a = aux8;
1382
- int is = 0;
1383
- for (int j = 0; j < QK_K/32; ++j) {
1384
- int32_t scale = scales[is++];
1385
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1386
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1387
- q8 += 8; a += 8;
1388
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1389
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1390
- q8 += 8; a += 8;
1391
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1392
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1393
- q8 += 8; a += 8;
1394
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1395
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1396
- q8 += 8; a += 8;
1397
- }
1398
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1399
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1400
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1401
- sumf -= dmin * sumi;
1402
- }
1403
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1404
- *s = sumf;
1218
+ UNUSED(x);
1219
+ UNUSED(y);
1220
+ UNUSED(nb);
1221
+ UNUSED(kmask1);
1222
+ UNUSED(kmask2);
1223
+ UNUSED(kmask3);
1224
+ UNUSED(utmp);
1225
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1405
1226
  #endif
1406
1227
  }
1407
1228
 
@@ -1556,47 +1377,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1556
1377
  *s = vec_extract(vsumf0, 0);
1557
1378
 
1558
1379
  #else
1559
-
1560
- int8_t aux8[QK_K];
1561
- int16_t aux16[8];
1562
- float sums [8];
1563
- int32_t aux32[8];
1564
- memset(sums, 0, 8*sizeof(float));
1565
-
1566
- float sumf = 0;
1567
- for (int i = 0; i < nb; ++i) {
1568
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1569
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
1570
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1571
- memset(aux32, 0, 8*sizeof(int32_t));
1572
- int8_t * GGML_RESTRICT a = aux8;
1573
- for (int j = 0; j < QK_K; j += 128) {
1574
- for (int l = 0; l < 32; ++l) {
1575
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1576
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1577
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1578
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1579
- }
1580
- a += 128;
1581
- q4 += 64;
1582
- qh += 32;
1583
- }
1584
- a = aux8;
1585
- int is = 0;
1586
- for (int j = 0; j < QK_K/16; ++j) {
1587
- int scale = x[i].scales[is++];
1588
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1589
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1590
- q8 += 8; a += 8;
1591
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1592
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1593
- q8 += 8; a += 8;
1594
- }
1595
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1596
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1597
- }
1598
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1599
- *s = sumf;
1380
+ UNUSED(x);
1381
+ UNUSED(y);
1382
+ UNUSED(nb);
1383
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1600
1384
  #endif
1601
1385
  }
1602
1386
 
@@ -1737,34 +1521,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
1737
1521
  *s = 0.125f * vec_extract(vsumf0, 0);
1738
1522
 
1739
1523
  #else
1740
-
1741
- uint32_t aux32[2];
1742
- const uint8_t * aux8 = (const uint8_t *)aux32;
1743
-
1744
- float sumf = 0.f;
1745
- for (int i = 0; i < nb; ++i) {
1746
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1747
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1748
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1749
- int32_t bsum = 0;
1750
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1751
- memcpy(aux32, q2, 2*sizeof(uint32_t));
1752
- q2 += 4;
1753
- const uint32_t ls = 2*(aux32[1] >> 28) + 1;
1754
- int32_t sumi = 0;
1755
- for (int l = 0; l < 4; ++l) {
1756
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
1757
- const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
1758
- for (int j = 0; j < 8; ++j) {
1759
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1760
- }
1761
- q8 += 8;
1762
- }
1763
- bsum += sumi * ls;
1764
- }
1765
- sumf += d * bsum;
1766
- }
1767
- *s = 0.125f * sumf;
1524
+ UNUSED(x);
1525
+ UNUSED(y);
1526
+ UNUSED(nb);
1527
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1768
1528
  #endif
1769
1529
  }
1770
1530
 
@@ -1869,42 +1629,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1869
1629
  *s = 0.125f * vec_extract(vsumf0, 0);
1870
1630
 
1871
1631
  #else
1872
-
1873
- float sumf = 0.f;
1874
- for (int i = 0; i < nb; ++i) {
1875
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1876
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1877
- const uint8_t * GGML_RESTRICT sc = x[i].scales;
1878
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1879
- int32_t bsum = 0;
1880
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1881
- const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
1882
- const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
1883
- int32_t sumi = 0;
1884
- for (int l = 0; l < 2; ++l) {
1885
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
1886
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
1887
- for (int j = 0; j < 8; ++j) {
1888
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1889
- }
1890
- q8 += 8;
1891
- }
1892
- bsum += sumi * ls1;
1893
- sumi = 0;
1894
- for (int l = 2; l < 4; ++l) {
1895
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
1896
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
1897
- for (int j = 0; j < 8; ++j) {
1898
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1899
- }
1900
- q8 += 8;
1901
- }
1902
- bsum += sumi * ls2;
1903
- q2 += 4;
1904
- }
1905
- sumf += d * bsum;
1906
- }
1907
- *s = 0.125f * sumf;
1632
+ UNUSED(x);
1633
+ UNUSED(y);
1634
+ UNUSED(nb);
1635
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1908
1636
  #endif
1909
1637
  }
1910
1638
 
@@ -2030,47 +1758,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2030
1758
  *s = 0.125f * vec_extract(vsumf0, 0);
2031
1759
 
2032
1760
  #else
2033
-
2034
- float sumf = 0;
2035
- for (int i = 0; i < nb; i++) {
2036
-
2037
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2038
- const int8_t * q8 = y[i].qs;
2039
- const uint8_t * qs = x[i].qs;
2040
- const uint8_t * qh = x[i].qh;
2041
- const uint8_t * signs = qs + QK_K/8;
2042
-
2043
- int bsum = 0;
2044
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2045
- int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
2046
- int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
2047
- int sumi1 = 0, sumi2 = 0;
2048
- for (int l = 0; l < 2; ++l) {
2049
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2050
- for (int j = 0; j < 8; ++j) {
2051
- sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2052
- }
2053
- q8 += 8;
2054
- }
2055
- for (int l = 2; l < 4; ++l) {
2056
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2057
- for (int j = 0; j < 8; ++j) {
2058
- sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2059
- }
2060
- q8 += 8;
2061
- }
2062
- bsum += ls1 * sumi1 + ls2 * sumi2;
2063
- qs += 4;
2064
- signs += 4;
2065
- }
2066
-
2067
- sumf += d * bsum;
2068
- }
2069
-
2070
- *s = 0.125f * sumf;
2071
-
1761
+ UNUSED(x);
1762
+ UNUSED(y);
1763
+ UNUSED(nb);
1764
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2072
1765
  #endif
2073
-
2074
1766
  }
2075
1767
 
2076
1768
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2172,36 +1864,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
2172
1864
  *s = 0.25f * vec_extract(vsumf0, 0);
2173
1865
 
2174
1866
  #else
2175
-
2176
- uint32_t aux32;
2177
-
2178
- float sumf = 0.f;
2179
- for (int i = 0; i < nb; ++i) {
2180
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2181
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2182
- const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2183
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2184
- int32_t bsum = 0;
2185
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2186
- memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
2187
- const uint32_t ls = 2*(aux32 >> 28) + 1;
2188
- int32_t sumi = 0;
2189
- for (int l = 0; l < 4; ++l) {
2190
- const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
2191
- const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
2192
- const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
2193
- for (int j = 0; j < 4; ++j) {
2194
- sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
2195
- sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
2196
- }
2197
- q8 += 8;
2198
- }
2199
- q3 += 8;
2200
- bsum += sumi * ls;
2201
- }
2202
- sumf += d * bsum;
2203
- }
2204
- *s = 0.25f * sumf;
1867
+ UNUSED(x);
1868
+ UNUSED(y);
1869
+ UNUSED(nb);
1870
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2205
1871
  #endif
2206
1872
  }
2207
1873
 
@@ -2327,48 +1993,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2327
1993
  *s = vec_extract(vsumf0, 0);
2328
1994
 
2329
1995
  #else
2330
-
2331
- float sumf = 0.f;
2332
- for (int i = 0; i < nb; ++i) {
2333
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2334
- const uint8_t * GGML_RESTRICT qs = x[i].qs;
2335
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
2336
- const uint8_t * GGML_RESTRICT signs = x[i].signs;
2337
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2338
- int32_t bsum = 0;
2339
- for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2340
- const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
2341
- const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
2342
- int32_t sumi = 0;
2343
- for (int l = 0; l < 4; ++l) {
2344
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
2345
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
2346
- for (int j = 0; j < 4; ++j) {
2347
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2348
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2349
- }
2350
- q8 += 8;
2351
- }
2352
- qs += 8;
2353
- signs += 4;
2354
- bsum += sumi * ls1;
2355
- sumi = 0;
2356
- for (int l = 0; l < 4; ++l) {
2357
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
2358
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
2359
- for (int j = 0; j < 4; ++j) {
2360
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2361
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2362
- }
2363
- q8 += 8;
2364
- }
2365
- qs += 8;
2366
- signs += 4;
2367
- bsum += sumi * ls2;
2368
- }
2369
- sumf += d * bsum;
2370
- }
2371
- *s = sumf;
1996
+ UNUSED(x);
1997
+ UNUSED(y);
1998
+ UNUSED(nb);
1999
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2372
2000
  #endif
2373
2001
  }
2374
2002
 
@@ -2481,36 +2109,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2481
2109
  *s = vec_extract(vsumf0, 0);
2482
2110
 
2483
2111
  #else
2484
-
2485
- float sumf = 0;
2486
- for (int i = 0; i < nb; i++) {
2487
-
2488
- const int8_t * q8 = y[i].qs;
2489
- const uint8_t * qs = x[i].qs;
2490
- const uint16_t * qh = x[i].qh;
2491
-
2492
- int sumi = 0, sumi1 = 0;
2493
- for (int ib = 0; ib < QK_K/32; ++ib) {
2494
- const int ls = 2*((qh[ib] >> 12) & 7) + 1;
2495
- const int delta = qh[ib] & 0x8000 ? -1 : 1;
2496
- int lsum = 0;
2497
- for (int l = 0; l < 4; ++l) {
2498
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
2499
- for (int j = 0; j < 8; ++j) {
2500
- lsum += q8[j] * grid[j];
2501
- }
2502
- q8 += 8;
2503
- }
2504
- sumi += ls * lsum;
2505
- sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
2506
- qs += 4;
2507
- }
2508
-
2509
- sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2510
- }
2511
-
2512
- *s = sumf;
2513
-
2112
+ UNUSED(x);
2113
+ UNUSED(y);
2114
+ UNUSED(nb);
2115
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2514
2116
  #endif
2515
2117
  }
2516
2118
 
@@ -2581,17 +2183,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
2581
2183
 
2582
2184
  sumf = vec_extract(vsumf0, 0);
2583
2185
 
2584
- #endif
2585
- for (; ib < nb; ++ib) {
2586
- const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
2587
- int sumi1 = 0, sumi2 = 0;
2588
- for (int j = 0; j < QK4_NL/2; ++j) {
2589
- sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
2590
- sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
2591
- }
2592
- sumf += d * (sumi1 + sumi2);
2593
- }
2594
2186
  *s = sumf;
2187
+ #else
2188
+ UNUSED(x);
2189
+ UNUSED(y);
2190
+ UNUSED(nb);
2191
+ UNUSED(ib);
2192
+ UNUSED(sumf);
2193
+ ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
2194
+ #endif
2595
2195
  }
2596
2196
 
2597
2197
  void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2696,37 +2296,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
2696
2296
  *s = vec_extract(vsumf0, 0);
2697
2297
 
2698
2298
  #else
2699
- float sumf = 0;
2700
- for (int ibl = 0; ibl < nb; ++ibl) {
2701
- const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
2702
- uint16_t h = x[ibl].scales_h;
2703
- const uint8_t * qs = x[ibl].qs;
2704
- const int8_t * q8 = y[ibl].qs;
2705
- for (int ib = 0; ib < QK_K/32; ib += 2) {
2706
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
2707
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
2708
- h >>= 4;
2709
- const float d1 = d4d8*(ls1 - 32);
2710
- const float d2 = d4d8*(ls2 - 32);
2711
- int sumi1 = 0, sumi2 = 0;
2712
- for (int j = 0; j < 16; ++j) {
2713
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2714
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2715
- }
2716
- sumf += d1 * (sumi1 + sumi2);
2717
- qs += 16;
2718
- q8 += 32;
2719
- sumi1 = sumi2 = 0;
2720
- for (int j = 0; j < 16; ++j) {
2721
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2722
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2723
- }
2724
- sumf += d2 * (sumi1 + sumi2);
2725
- qs += 16;
2726
- q8 += 32;
2727
- }
2728
- }
2729
- *s = sumf;
2299
+ UNUSED(x);
2300
+ UNUSED(y);
2301
+ UNUSED(nb);
2302
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2730
2303
  #endif
2731
2304
  }
2732
2305