whispercpp 1.3.3 → 1.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -23,6 +23,27 @@
23
23
 
24
24
  #define UNUSED GGML_UNUSED
25
25
 
26
+ #if defined(__VXE__) || defined(__VXE2__)
27
+ #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
28
+ #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
29
+ #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
30
+ #define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
31
+ #define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
32
+ #define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
33
+ #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
34
+ #define B8(c,s ) B7(c,s, c), B7(c,s, s)
35
+
36
+ // precomputed tables for expanding 8bits to 8 bytes:
37
+ static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
38
+ static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
39
+
40
+ // permute mask for byteswapping
41
+ static const uint8x16_t v_kperm = (const uint8x16_t){
42
+ 7, 6, 5, 4, 3, 2, 1, 0,
43
+ 15, 14, 13, 12, 11, 10, 9, 8
44
+ };
45
+ #endif
46
+
26
47
  void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
27
48
  assert(QK8_0 == 32);
28
49
  assert(k % QK8_0 == 0);
@@ -32,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
32
53
 
33
54
  #if defined(__VXE__) || defined(__VXE2__)
34
55
  for (int i = 0; i < nb; i++) {
35
- __vector float srcv [8];
36
- __vector float asrcv[8];
37
- __vector float amaxv[8];
56
+ float32x4_t srcv [8];
57
+ float32x4_t asrcv[8];
58
+ float32x4_t amaxv[8];
38
59
 
39
60
  for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
40
61
  for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -53,8 +74,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
53
74
  y[i].d = GGML_CPU_FP32_TO_FP16(d);
54
75
 
55
76
  for (int j = 0; j < 8; j++) {
56
- const __vector float v = vec_mul(srcv[j], vec_splats(id));
57
- const __vector int32_t vi = vec_signed(v);
77
+ const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
78
+ /* Uses non-default rounding for vec_signed or vec_round */
79
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
58
80
 
59
81
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
60
82
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -77,9 +99,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
77
99
 
78
100
  #if defined(__VXE__) || defined(__VXE2__)
79
101
  for (int i = 0; i < nb; i++) {
80
- __vector float srcv [8];
81
- __vector float asrcv[8];
82
- __vector float amaxv[8];
102
+ float32x4_t srcv [8];
103
+ float32x4_t asrcv[8];
104
+ float32x4_t amaxv[8];
83
105
 
84
106
  for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
85
107
  for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -97,11 +119,12 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
97
119
 
98
120
  y[i].d = GGML_CPU_FP32_TO_FP16(d);
99
121
 
100
- __vector int32_t acc = vec_splats(0);
122
+ int32x4_t acc = vec_splats(0);
101
123
 
102
124
  for (int j = 0; j < 8; j++) {
103
- const __vector float v = vec_mul(srcv[j], vec_splats(id));
104
- const __vector int32_t vi = vec_signed(v);
125
+ const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
126
+ /* Uses non-default rounding for vec_signed or vec_round */
127
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
105
128
 
106
129
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
107
130
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -141,55 +164,45 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
141
164
  float sumf = 0;
142
165
 
143
166
  #if defined(__VXE__) || defined(__VXE2__)
144
- __vector float acc = vec_splats(0.0f);
167
+ float32x4_t acc = vec_splats(0.0f);
145
168
 
146
- const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
147
- const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
169
+ const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
170
+ const int8x16_t v_s = vec_splats( (const int8_t)0x08);
148
171
 
149
172
  for (; ib < nb; ++ib) {
150
- const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
151
- const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
152
- const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
173
+ const uint8x16_t v_x = vec_xl(0, x[ib].qs);
174
+ const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
175
+ const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
153
176
 
154
- const __vector int8_t v_xls = vec_sub(v_xl, v_s);
155
- const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
177
+ const int8x16_t v_xls = vec_sub(v_xl, v_s);
178
+ const int8x16_t v_xhs = vec_sub(v_xh, v_s);
156
179
 
157
- const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
158
- const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
180
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
181
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
159
182
 
160
- const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
161
- const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
162
- const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
163
- const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
183
+ const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
184
+ const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
185
+ const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
186
+ const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
164
187
 
165
- __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
188
+ int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
166
189
 
167
- const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
168
- const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
190
+ const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
191
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
169
192
 
170
193
  acc = vec_madd(v_xy, v_d, acc);
171
194
  }
172
195
 
173
- sumf = acc[0] + acc[1] + acc[2] + acc[3];
174
-
175
- #endif
176
- for (; ib < nb; ++ib) {
177
- int sumi0 = 0;
178
- int sumi1 = 0;
179
-
180
- for (int j = 0; j < qk/2; ++j) {
181
- const int v0 = (x[ib].qs[j] & 0x0F) - 8;
182
- const int v1 = (x[ib].qs[j] >> 4) - 8;
183
-
184
- sumi0 += (v0 * y[ib].qs[j]);
185
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
186
- }
187
-
188
- int sumi = sumi0 + sumi1;
189
- sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
190
- }
191
-
196
+ sumf = vec_hsum_f32x4(acc);
192
197
  *s = sumf;
198
+ #else
199
+ UNUSED(nb);
200
+ UNUSED(x);
201
+ UNUSED(y);
202
+ UNUSED(ib);
203
+ UNUSED(sumf);
204
+ ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
205
+ #endif
193
206
  }
194
207
 
195
208
  void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -237,26 +250,406 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
237
250
  acc = vec_madd(v_xy, v_d, acc);
238
251
  }
239
252
 
240
- sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
253
+ sumf = vec_hsum_f32x4(acc) + summs;
254
+ *s = sumf;
255
+ #else
256
+ UNUSED(nb);
257
+ UNUSED(x);
258
+ UNUSED(y);
259
+ UNUSED(ib);
260
+ UNUSED(sumf);
261
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
262
+ #endif
263
+ }
264
+
265
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
266
+ assert(nrc == 1);
267
+ UNUSED(nrc);
268
+ UNUSED(bx);
269
+ UNUSED(by);
270
+ UNUSED(bs);
271
+ assert(n % QK_MXFP4 == 0);
272
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
273
+
274
+ const int qk = QK_MXFP4;
275
+ const int nb = n / qk;
276
+
277
+ const block_mxfp4 * GGML_RESTRICT x = vx;
278
+ const block_q8_0 * GGML_RESTRICT y = vy;
279
+
280
+ int ib = 0;
281
+ float sumf = 0.0f;
282
+
283
+ #if defined(__VXE__) || defined(__VXE2__)
284
+ const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
285
+ const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
286
+
287
+ float32x4_t v_acc = vec_splats(0.0f);
288
+
289
+ #pragma GCC unroll 8
290
+ for (; ib + 1 < nb; ib += 2) {
291
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
292
+ const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
293
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
294
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
295
+
296
+ const uint8x16_t v_x0 = vec_xl(0, x0->qs);
297
+ const uint8x16_t v_x1 = vec_xl(0, x1->qs);
298
+
299
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
300
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
301
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
302
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
303
+
304
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
305
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
306
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
307
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
308
+
309
+ const int8x16_t v_y0l = vec_xl(0, y0->qs);
310
+ const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
311
+ const int8x16_t v_y1l = vec_xl(0, y1->qs);
312
+ const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
313
+
314
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
315
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
316
+
317
+ const float32x4_t v_xy0f = vec_float(v_xy0);
318
+ const float32x4_t v_xy1f = vec_float(v_xy1);
319
+
320
+ const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
321
+ const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
322
+
323
+ v_acc = vec_madd(v_xy0f, v_d0, v_acc);
324
+ v_acc = vec_madd(v_xy1f, v_d1, v_acc);
325
+ }
326
+
327
+ for (; ib < nb; ++ib) {
328
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
329
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
330
+
331
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
332
+
333
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
334
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
335
+
336
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
337
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
338
+
339
+ const int8x16_t v_yl = vec_xl(0, y0->qs);
340
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
241
341
 
342
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
343
+ const float32x4_t v_xyf = vec_float(v_xy);
344
+
345
+ const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
346
+ v_acc = vec_madd(v_xyf, v_d, v_acc);
347
+ }
348
+
349
+ sumf = vec_hsum_f32x4(v_acc);
350
+ *s = sumf;
351
+ #else
352
+ UNUSED(x);
353
+ UNUSED(y);
354
+ UNUSED(ib);
355
+ UNUSED(sumf);
356
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
242
357
  #endif
358
+ }
359
+
360
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
361
+ const int qk = QK8_0;
362
+ const int nb = n / qk;
363
+
364
+ assert(n % qk == 0);
365
+ assert(qk == QK5_0);
366
+ assert(nrc == 1);
367
+ UNUSED(nrc);
368
+ UNUSED(bx);
369
+ UNUSED(by);
370
+ UNUSED(bs);
371
+
372
+ const block_q5_0 * GGML_RESTRICT x = vx;
373
+ const block_q8_0 * GGML_RESTRICT y = vy;
374
+
375
+ int ib = 0;
376
+ float sumf = 0.0f;
377
+
378
+ #if defined(__VXE__) || defined(__VXE2__)
379
+ float32x4_t v_sum0 = vec_splats(0.0f);
380
+ float32x4_t v_sum1 = vec_splats(0.0f);
381
+
382
+ uint32_t qh0, qh1;
383
+ uint64_t tmp0[4], tmp1[4];
384
+
385
+ const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
386
+
387
+ #pragma GCC unroll 4
388
+ for (; ib + 1 < nb; ib += 2) {
389
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
390
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
391
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
392
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
393
+
394
+ memcpy(&qh0, x0->qh, sizeof(qh0));
395
+ memcpy(&qh1, x1->qh, sizeof(qh1));
396
+
397
+ tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
398
+ tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
399
+ tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
400
+ tmp0[3] = table_b2b_1[(qh0 >> 24) ];
401
+
402
+ tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
403
+ tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
404
+ tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
405
+ tmp1[3] = table_b2b_1[(qh1 >> 24) ];
406
+
407
+ int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
408
+ int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
409
+ int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
410
+ int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
411
+
412
+ // required for fixing the byteorder
413
+ v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
414
+ v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
415
+ v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
416
+ v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
417
+
418
+ const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
419
+ const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
420
+
421
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
422
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
423
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
424
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
425
+
426
+ const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
427
+ const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
428
+ const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
429
+ const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
430
+
431
+ const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
432
+ const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
433
+ const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
434
+ const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
435
+
436
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
437
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
438
+
439
+ const float32x4_t v_xy0f = vec_float(v_xy0);
440
+ const float32x4_t v_xy1f = vec_float(v_xy1);
441
+
442
+ const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
443
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
444
+
445
+ v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
446
+ v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
447
+ }
448
+
449
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
450
+
451
+ #pragma GCC unroll 4
243
452
  for (; ib < nb; ++ib) {
244
- int sumi0 = 0;
245
- int sumi1 = 0;
453
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
454
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
246
455
 
247
- for (int j = 0; j < qk/2; ++j) {
248
- const int v0 = (x[ib].qs[j] & 0x0F);
249
- const int v1 = (x[ib].qs[j] >> 4);
456
+ uint32_t qh;
457
+ memcpy(&qh, x0->qh, sizeof(qh));
250
458
 
251
- sumi0 += (v0 * y[ib].qs[j]);
252
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
253
- }
459
+ uint64_t tmp[4];
460
+ tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
461
+ tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
462
+ tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
463
+ tmp[3] = table_b2b_1[(qh >> 24) ];
464
+
465
+ int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
466
+ int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
467
+
468
+ // required for fixing the byteorder
469
+ v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
470
+ v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
471
+
472
+ const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
473
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
474
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
475
+
476
+ const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
477
+ const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
478
+
479
+ const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
480
+ const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
254
481
 
255
- int sumi = sumi0 + sumi1;
256
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
482
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
483
+ const float32x4_t v_xyf = vec_float(v_xy);
484
+
485
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
486
+ const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
487
+
488
+ sumf += vec_hsum_f32x4(v_acc);
257
489
  }
258
490
 
259
491
  *s = sumf;
492
+ #else
493
+ UNUSED(nb);
494
+ UNUSED(x);
495
+ UNUSED(y);
496
+ UNUSED(ib);
497
+ UNUSED(sumf);
498
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
499
+ #endif
500
+ }
501
+
502
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
503
+ const int qk = QK8_1;
504
+ const int nb = n / qk;
505
+
506
+ assert(n % qk == 0);
507
+ assert(qk == QK5_1);
508
+ assert(nrc == 1);
509
+ UNUSED(nrc);
510
+ UNUSED(bx);
511
+ UNUSED(by);
512
+ UNUSED(bs);
513
+
514
+ const block_q5_1 * GGML_RESTRICT x = vx;
515
+ const block_q8_1 * GGML_RESTRICT y = vy;
516
+
517
+ int ib = 0;
518
+ float sumf = 0.0f;
519
+
520
+ #if defined(__VXE__) || defined(__VXE2__)
521
+ float32x4_t v_sum0 = vec_splats(0.0f);
522
+ float32x4_t v_sum1 = vec_splats(0.0f);
523
+
524
+ float summs0 = 0.0f;
525
+ float summs1 = 0.0f;
526
+
527
+ uint32_t qh0;
528
+ uint32_t qh1;
529
+
530
+ uint64_t tmp0[4];
531
+ uint64_t tmp1[4];
532
+
533
+ const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
534
+
535
+ #pragma GCC unroll 4
536
+ for (; ib + 1 < nb; ib += 2) {
537
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
538
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
539
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
540
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
541
+
542
+ summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
543
+ summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
544
+
545
+ memcpy(&qh0, x0->qh, sizeof(qh0));
546
+ memcpy(&qh1, x1->qh, sizeof(qh1));
547
+
548
+ tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
549
+ tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
550
+ tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
551
+ tmp0[3] = table_b2b_0[(qh0 >> 24) ];
552
+
553
+ tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
554
+ tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
555
+ tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
556
+ tmp1[3] = table_b2b_0[(qh1 >> 24) ];
557
+
558
+ int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
559
+ int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
560
+ int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
561
+ int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
562
+
563
+ // required for fixing the byteorder
564
+ v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
565
+ v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
566
+ v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
567
+ v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
568
+
569
+ const uint8x16_t v_x0 = vec_xl(0, x0->qs);
570
+ const uint8x16_t v_x1 = vec_xl(0, x1->qs);
571
+
572
+ const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
573
+ const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
574
+ const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
575
+ const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
576
+
577
+ const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
578
+ const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
579
+ const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
580
+ const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
581
+
582
+ const int8x16_t v_y0l = vec_xl(0 , y0->qs);
583
+ const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
584
+ const int8x16_t v_y1l = vec_xl(0 , y1->qs);
585
+ const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
586
+
587
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
588
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
589
+
590
+ const float32x4_t v_xy0f = vec_float(v_xy0);
591
+ const float32x4_t v_xy1f = vec_float(v_xy1);
592
+
593
+ const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
594
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
595
+
596
+ v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
597
+ v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
598
+ }
599
+
600
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
601
+
602
+ #pragma GCC unroll 4
603
+ for (; ib < nb; ++ib) {
604
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
605
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
606
+
607
+ float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
608
+
609
+ uint32_t qh;
610
+ memcpy(&qh, x0->qh, sizeof(qh));
611
+
612
+ uint64_t tmp[4];
613
+ tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
614
+ tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
615
+ tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
616
+ tmp[3] = table_b2b_0[(qh >> 24) ];
617
+
618
+ int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
619
+ int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
620
+
621
+ // required for fixing the byteorder
622
+ v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
623
+ v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
624
+
625
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
626
+ const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
627
+ const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
628
+
629
+ const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
630
+ const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
631
+
632
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
633
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
634
+
635
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
636
+ const float32x4_t v_xyf = vec_float(v_xy);
637
+
638
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
639
+ const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
640
+
641
+ sumf += vec_hsum_f32x4(v_acc) + summs;
642
+ }
643
+
644
+ *s = sumf;
645
+ #else
646
+ UNUSED(nb);
647
+ UNUSED(x);
648
+ UNUSED(y);
649
+ UNUSED(ib);
650
+ UNUSED(sumf);
651
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
652
+ #endif
260
653
  }
261
654
 
262
655
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -277,7 +670,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
277
670
  float sumf = 0;
278
671
 
279
672
  #if defined(__VXE__) || defined(__VXE2__)
280
- __vector float acc = vec_splats(0.0f);
673
+ float32x4_t acc = vec_splats(0.0f);
281
674
 
282
675
  #pragma GCC unroll 8
283
676
  for (; ib < nb; ++ib) {
@@ -296,20 +689,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
296
689
  acc = vec_madd(v_xy, v_d, acc);
297
690
  }
298
691
 
299
- sumf = acc[0] + acc[1] + acc[2] + acc[3];
300
-
301
- #endif
302
- for (; ib < nb; ++ib) {
303
- int sumi = 0;
304
-
305
- for (int j = 0; j < qk; j++) {
306
- sumi += x[ib].qs[j]*y[ib].qs[j];
307
- }
308
-
309
- sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
310
- }
692
+ sumf = vec_hsum_f32x4(acc);
311
693
 
312
694
  *s = sumf;
695
+ #else
696
+ UNUSED(nb);
697
+ UNUSED(x);
698
+ UNUSED(y);
699
+ UNUSED(ib);
700
+ UNUSED(sumf);
701
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
702
+ #endif
313
703
  }
314
704
 
315
705
  void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -343,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
343
733
  uint8x16_t q3h[4];
344
734
  uint8x16_t q3b[2];
345
735
  int8x16_t q3bytes[4];
346
- int8x16_t q8bytes[4];
736
+ int8x16_t q8bytes[8];
347
737
  uint8x16_t qhbits[2];
348
738
 
349
739
  float sum = 0;
@@ -423,10 +813,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
423
813
  isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
424
814
  isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
425
815
 
426
- isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
427
- isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
428
- isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
429
- isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
816
+ isum += vec_hsum_i32x4(isum0) * scale[0];
817
+ isum += vec_hsum_i32x4(isum1) * scale[1];
818
+ isum += vec_hsum_i32x4(isum2) * scale[2];
819
+ isum += vec_hsum_i32x4(isum3) * scale[3];
430
820
 
431
821
  scale += 4;
432
822
 
@@ -442,70 +832,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
442
832
  *s = sum;
443
833
 
444
834
  #else
445
- // scalar version
446
- // This function is written like this so the compiler can manage to vectorize most of it
447
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
448
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
449
- // The ideal situation would be if we could just write the code once, and the compiler would
450
- // automatically produce the best possible set of machine instructions, instead of us having to manually
451
- // write vectorized versions for AVX, ARM_NEON, etc.
452
-
453
- int8_t aux8[QK_K];
454
- int16_t aux16[8];
455
- float sums [8];
456
- int32_t aux32[8];
457
- memset(sums, 0, 8*sizeof(float));
458
-
459
- uint32_t auxs[4];
460
- const int8_t * scales = (const int8_t*)auxs;
461
-
462
- float sumf = 0;
463
- for (int i = 0; i < nb; ++i) {
464
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
465
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
466
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
467
- memset(aux32, 0, 8*sizeof(int32_t));
468
- int8_t * GGML_RESTRICT a = aux8;
469
- uint8_t m = 1;
470
- for (int j = 0; j < QK_K; j += 128) {
471
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
472
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
473
- a += 32; m <<= 1;
474
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
475
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
476
- a += 32; m <<= 1;
477
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
478
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
479
- a += 32; m <<= 1;
480
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
481
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
482
- a += 32; m <<= 1;
483
- q3 += 32;
484
- }
485
- a = aux8;
486
-
487
- memcpy(auxs, x[i].scales, 12);
488
- uint32_t tmp = auxs[2];
489
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
490
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
491
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
492
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
493
- for (int j = 0; j < QK_K/16; ++j) {
494
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
495
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
496
- q8 += 8; a += 8;
497
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
498
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
499
- q8 += 8; a += 8;
500
- }
501
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
502
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
503
- }
504
- for (int l = 0; l < 8; ++l) sumf += sums[l];
505
- *s = sumf;
506
-
835
+ UNUSED(kmask1);
836
+ UNUSED(kmask2);
837
+ UNUSED(x);
838
+ UNUSED(y);
839
+ UNUSED(nb);
840
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
507
841
  #endif
508
-
509
842
  }
510
843
 
511
844
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -581,7 +914,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
581
914
  v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
582
915
 
583
916
  const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
584
- sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
917
+ sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
585
918
 
586
919
  v_y[0] = vec_xl(0 , y0);
587
920
  v_y[1] = vec_xl(16, y0);
@@ -591,7 +924,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
591
924
  v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
592
925
 
593
926
  const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
594
- sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
927
+ sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
595
928
  }
596
929
 
597
930
  sumf += d * (sumi1 + sumi2);
@@ -600,61 +933,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
600
933
  *s = sumf;
601
934
 
602
935
  #else
603
-
604
- const uint8_t * scales = (const uint8_t*)&utmp[0];
605
- const uint8_t * mins = (const uint8_t*)&utmp[2];
606
-
607
- int8_t aux8[QK_K];
608
- int16_t aux16[8];
609
- float sums [8];
610
- int32_t aux32[8];
611
- memset(sums, 0, 8*sizeof(float));
612
-
613
- float sumf = 0;
614
- for (int i = 0; i < nb; ++i) {
615
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
616
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
617
- memset(aux32, 0, 8*sizeof(int32_t));
618
- int8_t * GGML_RESTRICT a = aux8;
619
- for (int j = 0; j < QK_K/64; ++j) {
620
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
621
- a += 32;
622
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
623
- a += 32; q4 += 32;
624
- }
625
- memcpy(utmp, x[i].scales, 12);
626
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
627
- const uint32_t uaux = utmp[1] & kmask1;
628
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
629
- utmp[2] = uaux;
630
- utmp[0] &= kmask1;
631
-
632
- int sumi = 0;
633
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
634
- a = aux8;
635
- int is = 0;
636
- for (int j = 0; j < QK_K/32; ++j) {
637
- int32_t scale = scales[is++];
638
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
639
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
640
- q8 += 8; a += 8;
641
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
642
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
643
- q8 += 8; a += 8;
644
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
645
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
646
- q8 += 8; a += 8;
647
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
648
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
649
- q8 += 8; a += 8;
650
- }
651
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
652
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
653
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
654
- sumf -= dmin * sumi;
655
- }
656
- for (int l = 0; l < 8; ++l) sumf += sums[l];
657
- *s = sumf;
936
+ UNUSED(x);
937
+ UNUSED(y);
938
+ UNUSED(nb);
939
+ UNUSED(kmask1);
940
+ UNUSED(kmask2);
941
+ UNUSED(kmask3);
942
+ UNUSED(utmp);
943
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
658
944
  #endif
659
945
  }
660
946
 
@@ -720,7 +1006,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
720
1006
  const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
721
1007
  const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
722
1008
  const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
723
- const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
1009
+ const int32_t mins = vec_hsum_i32x4(v_mins);
724
1010
 
725
1011
  const uint8_t * scales = (const uint8_t *)utmp;
726
1012
  const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -757,8 +1043,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
757
1043
  int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
758
1044
  int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
759
1045
 
760
- sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
761
- sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
1046
+ sumi += vec_hsum_i32x4(sumi0) * *scales++;
1047
+ sumi += vec_hsum_i32x4(sumi1) * *scales++;
762
1048
  }
763
1049
 
764
1050
  sumf += d * sumi - dmin * mins;
@@ -767,66 +1053,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
767
1053
  *s = sumf;
768
1054
 
769
1055
  #else
770
-
771
- const uint8_t * scales = (const uint8_t*)&utmp[0];
772
- const uint8_t * mins = (const uint8_t*)&utmp[2];
773
-
774
- int8_t aux8[QK_K];
775
- int16_t aux16[8];
776
- float sums [8];
777
- int32_t aux32[8];
778
- memset(sums, 0, 8*sizeof(float));
779
-
780
- float sumf = 0;
781
- for (int i = 0; i < nb; ++i) {
782
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
783
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
784
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
785
- memset(aux32, 0, 8*sizeof(int32_t));
786
- int8_t * GGML_RESTRICT a = aux8;
787
- uint8_t m = 1;
788
- for (int j = 0; j < QK_K/64; ++j) {
789
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
790
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
791
- a += 32; m <<= 1;
792
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
793
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
794
- a += 32; m <<= 1;
795
- q4 += 32;
796
- }
797
- memcpy(utmp, x[i].scales, 12);
798
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
799
- const uint32_t uaux = utmp[1] & kmask1;
800
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
801
- utmp[2] = uaux;
802
- utmp[0] &= kmask1;
803
-
804
- int sumi = 0;
805
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
806
- a = aux8;
807
- int is = 0;
808
- for (int j = 0; j < QK_K/32; ++j) {
809
- int32_t scale = scales[is++];
810
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
811
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
812
- q8 += 8; a += 8;
813
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
814
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
815
- q8 += 8; a += 8;
816
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
817
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
818
- q8 += 8; a += 8;
819
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
820
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
821
- q8 += 8; a += 8;
822
- }
823
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
824
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
825
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
826
- sumf -= dmin * sumi;
827
- }
828
- for (int l = 0; l < 8; ++l) sumf += sums[l];
829
- *s = sumf;
1056
+ UNUSED(x);
1057
+ UNUSED(y);
1058
+ UNUSED(nb);
1059
+ UNUSED(kmask1);
1060
+ UNUSED(kmask2);
1061
+ UNUSED(kmask3);
1062
+ UNUSED(utmp);
1063
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
830
1064
  #endif
831
1065
  }
832
1066
 
@@ -881,7 +1115,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
881
1115
  const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
882
1116
  const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
883
1117
 
884
- const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
1118
+ const int32_t mins = vec_hsum_i32x4(v_mins);
885
1119
 
886
1120
  int32_t isum = 0;
887
1121
  for (int j = 0; j < QK_K/128; ++j) {
@@ -921,10 +1155,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
921
1155
  int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
922
1156
  int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
923
1157
 
924
- isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
925
- (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
926
- (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
927
- (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
1158
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
1159
+ vec_hsum_i32x4(summs1) * scale[1] +
1160
+ vec_hsum_i32x4(summs2) * scale[2] +
1161
+ vec_hsum_i32x4(summs3) * scale[3];
928
1162
 
929
1163
  scale += 4;
930
1164
 
@@ -955,10 +1189,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
955
1189
  summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
956
1190
  summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
957
1191
 
958
- isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
959
- (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
960
- (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
961
- (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
1192
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
1193
+ vec_hsum_i32x4(summs1) * scale[1] +
1194
+ vec_hsum_i32x4(summs2) * scale[2] +
1195
+ vec_hsum_i32x4(summs3) * scale[3];
962
1196
 
963
1197
  scale += 4;
964
1198
  }
@@ -969,47 +1203,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
969
1203
  *s = sum;
970
1204
 
971
1205
  #else
972
-
973
- int8_t aux8[QK_K];
974
- int16_t aux16[8];
975
- float sums [8];
976
- int32_t aux32[8];
977
- memset(sums, 0, 8*sizeof(float));
978
-
979
- float sumf = 0;
980
- for (int i = 0; i < nb; ++i) {
981
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
982
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
983
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
984
- memset(aux32, 0, 8*sizeof(int32_t));
985
- int8_t * GGML_RESTRICT a = aux8;
986
- for (int j = 0; j < QK_K; j += 128) {
987
- for (int l = 0; l < 32; ++l) {
988
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
989
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
990
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
991
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
992
- }
993
- a += 128;
994
- q4 += 64;
995
- qh += 32;
996
- }
997
- a = aux8;
998
- int is = 0;
999
- for (int j = 0; j < QK_K/16; ++j) {
1000
- int scale = x[i].scales[is++];
1001
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1002
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1003
- q8 += 8; a += 8;
1004
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1005
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1006
- q8 += 8; a += 8;
1007
- }
1008
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1009
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1010
- }
1011
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1012
- *s = sumf;
1206
+ UNUSED(x);
1207
+ UNUSED(y);
1208
+ UNUSED(nb);
1209
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1013
1210
  #endif
1014
1211
  }
1015
1212
 
@@ -1183,20 +1380,18 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
1183
1380
  const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
1184
1381
  const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
1185
1382
 
1186
- sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
1383
+ sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
1187
1384
  }
1188
1385
 
1189
- #endif
1190
- for (; ib < nb; ++ib) {
1191
- const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
1192
- int sumi1 = 0, sumi2 = 0;
1193
- for (int j = 0; j < QK4_NL/2; ++j) {
1194
- sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
1195
- sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
1196
- }
1197
- sumf += d * (sumi1 + sumi2);
1198
- }
1199
1386
  *s = sumf;
1387
+ #else
1388
+ UNUSED(x);
1389
+ UNUSED(y);
1390
+ UNUSED(nb);
1391
+ UNUSED(ib);
1392
+ UNUSED(sumf);
1393
+ ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
1394
+ #endif
1200
1395
  }
1201
1396
 
1202
1397
  void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1254,8 +1449,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1254
1449
 
1255
1450
  h >>= 4;
1256
1451
 
1257
- sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
1258
- sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
1452
+ sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
1453
+ sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
1259
1454
  }
1260
1455
 
1261
1456
  sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
@@ -1264,37 +1459,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1264
1459
  *s = sumf;
1265
1460
 
1266
1461
  #else
1267
- float sumf = 0;
1268
- for (int ibl = 0; ibl < nb; ++ibl) {
1269
- const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1270
- uint16_t h = x[ibl].scales_h;
1271
- const uint8_t * qs = x[ibl].qs;
1272
- const int8_t * q8 = y[ibl].qs;
1273
- for (int ib = 0; ib < QK_K/32; ib += 2) {
1274
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
1275
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
1276
- h >>= 4;
1277
- const float d1 = d4d8*(ls1 - 32);
1278
- const float d2 = d4d8*(ls2 - 32);
1279
- int sumi1 = 0, sumi2 = 0;
1280
- for (int j = 0; j < 16; ++j) {
1281
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
1282
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
1283
- }
1284
- sumf += d1 * (sumi1 + sumi2);
1285
- qs += 16;
1286
- q8 += 32;
1287
- sumi1 = sumi2 = 0;
1288
- for (int j = 0; j < 16; ++j) {
1289
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
1290
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
1291
- }
1292
- sumf += d2 * (sumi1 + sumi2);
1293
- qs += 16;
1294
- q8 += 32;
1295
- }
1296
- }
1297
- *s = sumf;
1462
+ UNUSED(x);
1463
+ UNUSED(y);
1464
+ UNUSED(nb);
1465
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1298
1466
  #endif
1299
1467
  }
1300
1468