whispercpp 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -66,6 +66,12 @@ static inline int hsum_i32_4(const __m128i a) {
66
66
  }
67
67
 
68
68
  #if defined(__AVX2__) || defined(__AVX512F__)
69
+ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
70
+ const __m256i ax = _mm256_sign_epi8(x, x);
71
+ const __m256i sy = _mm256_sign_epi8(y, x);
72
+ return _mm256_maddubs_epi16(ax, sy);
73
+ }
74
+
69
75
  // spread 32 bits to 32 bytes { 0x00, 0xFF }
70
76
  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
71
77
  uint32_t x32;
@@ -261,6 +267,11 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
261
267
  return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
262
268
  _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
263
269
  }
270
+
271
+ static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
272
+ return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
273
+ _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
274
+ }
264
275
  #endif
265
276
  #elif defined(__SSSE3__)
266
277
  // horizontally add 4x4 floats
@@ -702,7 +713,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
702
713
  const block_q8_1 * GGML_RESTRICT y = vy;
703
714
 
704
715
  int ib = 0;
705
- float sumf = 0;
706
716
 
707
717
  #if defined(__AVX2__) || defined(__AVX__)
708
718
  // Initialize accumulator with zeros
@@ -737,25 +747,98 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
737
747
  #endif
738
748
  }
739
749
 
740
- sumf = hsum_float_8(acc) + summs;
741
-
750
+ *s = hsum_float_8(acc) + summs;
751
+ #else
752
+ UNUSED(nb);
753
+ UNUSED(x);
754
+ UNUSED(y);
755
+ UNUSED(ib);
756
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
742
757
  #endif
743
- for (; ib < nb; ++ib) {
744
- int sumi0 = 0;
745
- int sumi1 = 0;
758
+ }
746
759
 
747
- for (int j = 0; j < qk/2; ++j) {
748
- const int v0 = (x[ib].qs[j] & 0x0F);
749
- const int v1 = (x[ib].qs[j] >> 4);
760
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
761
+ assert(nrc == 1);
762
+ UNUSED(nrc);
763
+ UNUSED(bx);
764
+ UNUSED(by);
765
+ UNUSED(bs);
766
+ assert(n % QK_MXFP4 == 0);
767
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
750
768
 
751
- sumi0 += (v0 * y[ib].qs[j]);
752
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
753
- }
769
+ const block_mxfp4 * GGML_RESTRICT x = vx;
770
+ const block_q8_0 * GGML_RESTRICT y = vy;
754
771
 
755
- int sumi = sumi0 + sumi1;
756
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
772
+ const int nb = n / QK_MXFP4;
773
+
774
+ int ib = 0;
775
+ float sumf = 0;
776
+
777
+ #if defined __AVX2__
778
+
779
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
780
+ const __m128i m4b = _mm_set1_epi8(0x0f);
781
+ const __m256i mone = _mm256_set1_epi16(1);
782
+
783
+ __m256 accum1 = _mm256_setzero_ps();
784
+ __m256 accum2 = _mm256_setzero_ps();
785
+ for (; ib + 1 < nb; ib += 2) {
786
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
787
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
788
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
789
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
790
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
791
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
792
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
793
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
794
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
795
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
796
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
797
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
798
+ accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
799
+ _mm256_cvtepi32_ps(p_1), accum1);
800
+ accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
801
+ _mm256_cvtepi32_ps(p_2), accum2);
757
802
  }
758
803
 
804
+ sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
805
+
806
+ #elif defined __AVX__
807
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
808
+ const __m128i m4b = _mm_set1_epi8(0x0f);
809
+
810
+ __m256 accum = _mm256_setzero_ps();
811
+ for (; ib + 1 < nb; ib += 2) {
812
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
813
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
814
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
815
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
816
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
817
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
818
+
819
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
820
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
821
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
822
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
823
+
824
+ const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
825
+ const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
826
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
827
+ }
828
+
829
+ sumf = hsum_float_8(accum);
830
+
831
+ #endif
832
+ for (; ib < nb; ++ib) {
833
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
834
+ int sumi1 = 0;
835
+ int sumi2 = 0;
836
+ for (int j = 0; j < QK_MXFP4/2; ++j) {
837
+ sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
838
+ sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
839
+ }
840
+ sumf += d * (sumi1 + sumi2);
841
+ }
759
842
  *s = sumf;
760
843
  }
761
844
 
@@ -764,7 +847,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
764
847
  const int nb = n / qk;
765
848
 
766
849
  int ib = 0;
767
- float sumf = 0;
768
850
 
769
851
  assert(n % qk == 0);
770
852
  assert(qk == QK5_0);
@@ -799,7 +881,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
799
881
  acc = _mm256_fmadd_ps(d, q, acc);
800
882
  }
801
883
 
802
- sumf = hsum_float_8(acc);
884
+ *s = hsum_float_8(acc);
803
885
  #elif defined(__AVX__)
804
886
  // Initialize accumulator with zeros
805
887
  __m256 acc = _mm256_setzero_ps();
@@ -830,32 +912,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
830
912
  acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
831
913
  }
832
914
 
833
- sumf = hsum_float_8(acc);
834
-
915
+ *s = hsum_float_8(acc);
916
+ #else
917
+ UNUSED(nb);
918
+ UNUSED(ib);
919
+ UNUSED(x);
920
+ UNUSED(y);
921
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
835
922
  #endif
836
- for (; ib < nb; ++ib) {
837
- uint32_t qh;
838
- memcpy(&qh, x[ib].qh, sizeof(qh));
839
-
840
- int sumi0 = 0;
841
- int sumi1 = 0;
842
-
843
- for (int j = 0; j < qk/2; ++j) {
844
- const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
845
- const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
846
-
847
- const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
848
- const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
849
-
850
- sumi0 += (x0 * y[ib].qs[j]);
851
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
852
- }
853
-
854
- int sumi = sumi0 + sumi1;
855
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
856
- }
857
-
858
- *s = sumf;
859
923
  }
860
924
 
861
925
  void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -863,7 +927,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
863
927
  const int nb = n / qk;
864
928
 
865
929
  int ib = 0;
866
- float sumf = 0;
867
930
 
868
931
  assert(n % qk == 0);
869
932
  assert(qk == QK5_1);
@@ -901,7 +964,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
901
964
  acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
902
965
  }
903
966
 
904
- sumf = hsum_float_8(acc) + summs;
967
+ *s = hsum_float_8(acc) + summs;
905
968
  #elif defined(__AVX__)
906
969
  // Initialize accumulator with zeros
907
970
  __m256 acc = _mm256_setzero_ps();
@@ -935,32 +998,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
935
998
  acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
936
999
  }
937
1000
 
938
- sumf = hsum_float_8(acc) + summs;
939
-
1001
+ *s = hsum_float_8(acc) + summs;
1002
+ #else
1003
+ UNUSED(nb);
1004
+ UNUSED(ib);
1005
+ UNUSED(x);
1006
+ UNUSED(y);
1007
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
940
1008
  #endif
941
- for (; ib < nb; ++ib) {
942
- uint32_t qh;
943
- memcpy(&qh, x[ib].qh, sizeof(qh));
944
-
945
- int sumi0 = 0;
946
- int sumi1 = 0;
947
-
948
- for (int j = 0; j < qk/2; ++j) {
949
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
950
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
951
-
952
- const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
953
- const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
954
-
955
- sumi0 += (x0 * y[ib].qs[j]);
956
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
957
- }
958
-
959
- int sumi = sumi0 + sumi1;
960
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
961
- }
962
-
963
- *s = sumf;
964
1009
  }
965
1010
 
966
1011
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1017,7 +1062,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1017
1062
  }
1018
1063
 
1019
1064
  sumf = hsum_float_8(accum);
1020
-
1021
1065
  #endif
1022
1066
  for (; ib < nb; ++ib) {
1023
1067
  int sumi = 0;
@@ -1157,44 +1201,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1157
1201
  *s = hsum_float_8(sumf);
1158
1202
 
1159
1203
  #else
1160
- const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
1161
-
1162
- float sumf = 0.0f;
1163
-
1164
- for (int i = 0; i < nb; ++i) {
1165
- int sum = 0;
1166
-
1167
- for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
1168
- for (size_t l = 0; l < 5; ++l) {
1169
- for (size_t m = 0; m < 32; ++m) {
1170
- uint8_t q = x[i].qs[j + m] * pow3[l];
1171
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1172
- sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
1173
- }
1174
- }
1175
- }
1176
- for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
1177
- for (size_t l = 0; l < 5; ++l) {
1178
- for (size_t m = 0; m < 16; ++m) {
1179
- uint8_t q = x[i].qs[j + m] * pow3[l];
1180
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1181
- sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
1182
- }
1183
- }
1184
- }
1185
-
1186
- for (size_t l = 0; l < 4; ++l) {
1187
- for (size_t j = 0; j < sizeof(x->qh); ++j) {
1188
- uint8_t q = x[i].qh[j] * pow3[l];
1189
- uint16_t xi = ((uint16_t) q * 3) >> 8;
1190
- sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
1191
- }
1192
- }
1193
-
1194
- sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
1195
- }
1196
-
1197
- *s = sumf;
1204
+ UNUSED(x);
1205
+ UNUSED(y);
1206
+ UNUSED(nb);
1207
+ ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1198
1208
  #endif
1199
1209
  }
1200
1210
 
@@ -1257,25 +1267,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1257
1267
  *s = hsum_float_8(sumf);
1258
1268
 
1259
1269
  #else
1260
- float sumf = 0.0f;
1261
-
1262
- for (int i = 0; i < nb; ++i) {
1263
- int32_t sumi = 0;
1264
-
1265
- for (size_t j = 0; j < sizeof(x->qs); j += 32) {
1266
- for (size_t l = 0; l < 4; ++l) {
1267
- for (size_t k = 0; k < 32; ++k) {
1268
- sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
1269
- }
1270
- }
1271
- }
1272
-
1273
- const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1274
-
1275
- sumf += (float) sumi * d;
1276
- }
1277
-
1278
- *s = sumf;
1270
+ UNUSED(x);
1271
+ UNUSED(y);
1272
+ UNUSED(nb);
1273
+ ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1279
1274
  #endif
1280
1275
  }
1281
1276
 
@@ -1464,45 +1459,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1464
1459
  *s = hsum_float_8(acc);
1465
1460
 
1466
1461
  #else
1467
-
1468
- float sumf = 0;
1469
-
1470
- for (int i = 0; i < nb; ++i) {
1471
-
1472
- const uint8_t * q2 = x[i].qs;
1473
- const int8_t * q8 = y[i].qs;
1474
- const uint8_t * sc = x[i].scales;
1475
-
1476
- int summs = 0;
1477
- for (int j = 0; j < 16; ++j) {
1478
- summs += y[i].bsums[j] * (sc[j] >> 4);
1479
- }
1480
-
1481
- const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1482
- const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1483
-
1484
- int isum = 0;
1485
- int is = 0;
1486
- int d;
1487
- for (int k = 0; k < QK_K/128; ++k) {
1488
- int shift = 0;
1489
- for (int j = 0; j < 4; ++j) {
1490
- d = sc[is++] & 0xF;
1491
- int isuml = 0;
1492
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1493
- isum += d * isuml;
1494
- d = sc[is++] & 0xF;
1495
- isuml = 0;
1496
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1497
- isum += d * isuml;
1498
- shift += 2;
1499
- q8 += 32;
1500
- }
1501
- q2 += 32;
1502
- }
1503
- sumf += dall * isum - dmin * summs;
1504
- }
1505
- *s = sumf;
1462
+ UNUSED(x);
1463
+ UNUSED(y);
1464
+ UNUSED(nb);
1465
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1506
1466
  #endif
1507
1467
  }
1508
1468
 
@@ -1769,70 +1729,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1769
1729
  *s = hsum_float_8(acc);
1770
1730
 
1771
1731
  #else
1772
- // scalar version
1773
- // This function is written like this so the compiler can manage to vectorize most of it
1774
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1775
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1776
- // The ideal situation would be if we could just write the code once, and the compiler would
1777
- // automatically produce the best possible set of machine instructions, instead of us having to manually
1778
- // write vectorized versions for AVX, ARM_NEON, etc.
1779
-
1780
- int8_t aux8[QK_K];
1781
- int16_t aux16[8];
1782
- float sums [8];
1783
- int32_t aux32[8];
1784
- memset(sums, 0, 8*sizeof(float));
1785
-
1786
- uint32_t auxs[4];
1787
- const int8_t * scales = (const int8_t*)auxs;
1788
-
1789
- float sumf = 0;
1790
- for (int i = 0; i < nb; ++i) {
1791
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1792
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1793
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1794
- memset(aux32, 0, 8*sizeof(int32_t));
1795
- int8_t * GGML_RESTRICT a = aux8;
1796
- uint8_t m = 1;
1797
- for (int j = 0; j < QK_K; j += 128) {
1798
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1799
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1800
- a += 32; m <<= 1;
1801
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
1802
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1803
- a += 32; m <<= 1;
1804
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
1805
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1806
- a += 32; m <<= 1;
1807
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
1808
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1809
- a += 32; m <<= 1;
1810
- q3 += 32;
1811
- }
1812
- a = aux8;
1813
-
1814
- memcpy(auxs, x[i].scales, 12);
1815
- uint32_t tmp = auxs[2];
1816
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
1817
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
1818
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
1819
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
1820
- for (int j = 0; j < QK_K/16; ++j) {
1821
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1822
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1823
- q8 += 8; a += 8;
1824
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1825
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1826
- q8 += 8; a += 8;
1827
- }
1828
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1829
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1830
- }
1831
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1832
- *s = sumf;
1833
-
1732
+ UNUSED(kmask1);
1733
+ UNUSED(kmask2);
1734
+ UNUSED(x);
1735
+ UNUSED(y);
1736
+ UNUSED(nb);
1737
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1834
1738
  #endif
1835
-
1836
1739
  }
1837
1740
 
1838
1741
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2002,61 +1905,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2002
1905
  *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
2003
1906
 
2004
1907
  #else
2005
-
2006
- const uint8_t * scales = (const uint8_t*)&utmp[0];
2007
- const uint8_t * mins = (const uint8_t*)&utmp[2];
2008
-
2009
- int8_t aux8[QK_K];
2010
- int16_t aux16[8];
2011
- float sums [8];
2012
- int32_t aux32[8];
2013
- memset(sums, 0, 8*sizeof(float));
2014
-
2015
- float sumf = 0;
2016
- for (int i = 0; i < nb; ++i) {
2017
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
2018
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2019
- memset(aux32, 0, 8*sizeof(int32_t));
2020
- int8_t * GGML_RESTRICT a = aux8;
2021
- for (int j = 0; j < QK_K/64; ++j) {
2022
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2023
- a += 32;
2024
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2025
- a += 32; q4 += 32;
2026
- }
2027
- memcpy(utmp, x[i].scales, 12);
2028
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2029
- const uint32_t uaux = utmp[1] & kmask1;
2030
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2031
- utmp[2] = uaux;
2032
- utmp[0] &= kmask1;
2033
-
2034
- int sumi = 0;
2035
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2036
- a = aux8;
2037
- int is = 0;
2038
- for (int j = 0; j < QK_K/32; ++j) {
2039
- int32_t scale = scales[is++];
2040
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2041
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2042
- q8 += 8; a += 8;
2043
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2044
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2045
- q8 += 8; a += 8;
2046
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2047
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2048
- q8 += 8; a += 8;
2049
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2050
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2051
- q8 += 8; a += 8;
2052
- }
2053
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2054
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2055
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2056
- sumf -= dmin * sumi;
2057
- }
2058
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2059
- *s = sumf;
1908
+ UNUSED(x);
1909
+ UNUSED(y);
1910
+ UNUSED(nb);
1911
+ UNUSED(kmask1);
1912
+ UNUSED(kmask2);
1913
+ UNUSED(kmask3);
1914
+ UNUSED(utmp);
1915
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2060
1916
  #endif
2061
1917
  }
2062
1918
 
@@ -2259,66 +2115,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2259
2115
  *s = hsum_float_8(acc) + summs;
2260
2116
 
2261
2117
  #else
2262
-
2263
- const uint8_t * scales = (const uint8_t*)&utmp[0];
2264
- const uint8_t * mins = (const uint8_t*)&utmp[2];
2265
-
2266
- int8_t aux8[QK_K];
2267
- int16_t aux16[8];
2268
- float sums [8];
2269
- int32_t aux32[8];
2270
- memset(sums, 0, 8*sizeof(float));
2271
-
2272
- float sumf = 0;
2273
- for (int i = 0; i < nb; ++i) {
2274
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
2275
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
2276
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2277
- memset(aux32, 0, 8*sizeof(int32_t));
2278
- int8_t * GGML_RESTRICT a = aux8;
2279
- uint8_t m = 1;
2280
- for (int j = 0; j < QK_K/64; ++j) {
2281
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
2282
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2283
- a += 32; m <<= 1;
2284
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
2285
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
2286
- a += 32; m <<= 1;
2287
- q4 += 32;
2288
- }
2289
- memcpy(utmp, x[i].scales, 12);
2290
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2291
- const uint32_t uaux = utmp[1] & kmask1;
2292
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2293
- utmp[2] = uaux;
2294
- utmp[0] &= kmask1;
2295
-
2296
- int sumi = 0;
2297
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
2298
- a = aux8;
2299
- int is = 0;
2300
- for (int j = 0; j < QK_K/32; ++j) {
2301
- int32_t scale = scales[is++];
2302
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2303
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2304
- q8 += 8; a += 8;
2305
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2306
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2307
- q8 += 8; a += 8;
2308
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2309
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2310
- q8 += 8; a += 8;
2311
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2312
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2313
- q8 += 8; a += 8;
2314
- }
2315
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2316
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2317
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2318
- sumf -= dmin * sumi;
2319
- }
2320
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2321
- *s = sumf;
2118
+ UNUSED(x);
2119
+ UNUSED(y);
2120
+ UNUSED(nb);
2121
+ UNUSED(kmask1);
2122
+ UNUSED(kmask2);
2123
+ UNUSED(kmask3);
2124
+ UNUSED(utmp);
2125
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2322
2126
  #endif
2323
2127
  }
2324
2128
 
@@ -2520,47 +2324,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2520
2324
  *s = hsum_float_8(acc);
2521
2325
 
2522
2326
  #else
2523
-
2524
- int8_t aux8[QK_K];
2525
- int16_t aux16[8];
2526
- float sums [8];
2527
- int32_t aux32[8];
2528
- memset(sums, 0, 8*sizeof(float));
2529
-
2530
- float sumf = 0;
2531
- for (int i = 0; i < nb; ++i) {
2532
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2533
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
2534
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2535
- memset(aux32, 0, 8*sizeof(int32_t));
2536
- int8_t * GGML_RESTRICT a = aux8;
2537
- for (int j = 0; j < QK_K; j += 128) {
2538
- for (int l = 0; l < 32; ++l) {
2539
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2540
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2541
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2542
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2543
- }
2544
- a += 128;
2545
- q4 += 64;
2546
- qh += 32;
2547
- }
2548
- a = aux8;
2549
- int is = 0;
2550
- for (int j = 0; j < QK_K/16; ++j) {
2551
- int scale = x[i].scales[is++];
2552
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2553
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2554
- q8 += 8; a += 8;
2555
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2556
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2557
- q8 += 8; a += 8;
2558
- }
2559
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2560
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2561
- }
2562
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2563
- *s = sumf;
2327
+ UNUSED(x);
2328
+ UNUSED(y);
2329
+ UNUSED(nb);
2330
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2564
2331
  #endif
2565
2332
  }
2566
2333
 
@@ -2712,34 +2479,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
2712
2479
  *s = 0.125f * hsum_float_8(accumf);
2713
2480
 
2714
2481
  #else
2715
-
2716
- uint32_t aux32[2];
2717
- const uint8_t * aux8 = (const uint8_t *)aux32;
2718
-
2719
- float sumf = 0.f;
2720
- for (int i = 0; i < nb; ++i) {
2721
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2722
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
2723
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2724
- int32_t bsum = 0;
2725
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2726
- memcpy(aux32, q2, 2*sizeof(uint32_t));
2727
- q2 += 4;
2728
- const uint32_t ls = 2*(aux32[1] >> 28) + 1;
2729
- int32_t sumi = 0;
2730
- for (int l = 0; l < 4; ++l) {
2731
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
2732
- const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
2733
- for (int j = 0; j < 8; ++j) {
2734
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
2735
- }
2736
- q8 += 8;
2737
- }
2738
- bsum += sumi * ls;
2739
- }
2740
- sumf += d * bsum;
2741
- }
2742
- *s = 0.125f * sumf;
2482
+ UNUSED(x);
2483
+ UNUSED(y);
2484
+ UNUSED(nb);
2485
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2743
2486
  #endif
2744
2487
  }
2745
2488
 
@@ -3033,42 +2776,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
3033
2776
  *s = 0.125f * hsum_float_8(accumf);
3034
2777
 
3035
2778
  #else
3036
-
3037
- float sumf = 0.f;
3038
- for (int i = 0; i < nb; ++i) {
3039
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3040
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3041
- const uint8_t * GGML_RESTRICT sc = x[i].scales;
3042
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
3043
- int32_t bsum = 0;
3044
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3045
- const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
3046
- const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
3047
- int32_t sumi = 0;
3048
- for (int l = 0; l < 2; ++l) {
3049
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3050
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3051
- for (int j = 0; j < 8; ++j) {
3052
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3053
- }
3054
- q8 += 8;
3055
- }
3056
- bsum += sumi * ls1;
3057
- sumi = 0;
3058
- for (int l = 2; l < 4; ++l) {
3059
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
3060
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
3061
- for (int j = 0; j < 8; ++j) {
3062
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
3063
- }
3064
- q8 += 8;
3065
- }
3066
- bsum += sumi * ls2;
3067
- q2 += 4;
3068
- }
3069
- sumf += d * bsum;
3070
- }
3071
- *s = 0.125f * sumf;
2779
+ UNUSED(x);
2780
+ UNUSED(y);
2781
+ UNUSED(nb);
2782
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3072
2783
  #endif
3073
2784
  }
3074
2785
 
@@ -3250,47 +2961,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3250
2961
  *s = 0.125f * hsum_float_8(accumf);
3251
2962
 
3252
2963
  #else
3253
-
3254
- float sumf = 0;
3255
- for (int i = 0; i < nb; i++) {
3256
-
3257
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3258
- const int8_t * q8 = y[i].qs;
3259
- const uint8_t * qs = x[i].qs;
3260
- const uint8_t * qh = x[i].qh;
3261
- const uint8_t * signs = qs + QK_K/8;
3262
-
3263
- int bsum = 0;
3264
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3265
- int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
3266
- int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
3267
- int sumi1 = 0, sumi2 = 0;
3268
- for (int l = 0; l < 2; ++l) {
3269
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3270
- for (int j = 0; j < 8; ++j) {
3271
- sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3272
- }
3273
- q8 += 8;
3274
- }
3275
- for (int l = 2; l < 4; ++l) {
3276
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
3277
- for (int j = 0; j < 8; ++j) {
3278
- sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
3279
- }
3280
- q8 += 8;
3281
- }
3282
- bsum += ls1 * sumi1 + ls2 * sumi2;
3283
- qs += 4;
3284
- signs += 4;
3285
- }
3286
-
3287
- sumf += d * bsum;
3288
- }
3289
-
3290
- *s = 0.125f * sumf;
3291
-
2964
+ UNUSED(x);
2965
+ UNUSED(y);
2966
+ UNUSED(nb);
2967
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3292
2968
  #endif
3293
-
3294
2969
  }
3295
2970
 
3296
2971
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -3410,36 +3085,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
3410
3085
  *s = 0.25f * hsum_float_8(accumf);
3411
3086
 
3412
3087
  #else
3413
-
3414
- uint32_t aux32;
3415
-
3416
- float sumf = 0.f;
3417
- for (int i = 0; i < nb; ++i) {
3418
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3419
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3420
- const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3421
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
3422
- int32_t bsum = 0;
3423
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
3424
- memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
3425
- const uint32_t ls = 2*(aux32 >> 28) + 1;
3426
- int32_t sumi = 0;
3427
- for (int l = 0; l < 4; ++l) {
3428
- const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
3429
- const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
3430
- const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
3431
- for (int j = 0; j < 4; ++j) {
3432
- sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
3433
- sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
3434
- }
3435
- q8 += 8;
3436
- }
3437
- q3 += 8;
3438
- bsum += sumi * ls;
3439
- }
3440
- sumf += d * bsum;
3441
- }
3442
- *s = 0.25f * sumf;
3088
+ UNUSED(x);
3089
+ UNUSED(y);
3090
+ UNUSED(nb);
3091
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3443
3092
  #endif
3444
3093
  }
3445
3094
 
@@ -3646,59 +3295,13 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3646
3295
  *s = hsum_float_8(accumf);
3647
3296
 
3648
3297
  #else
3649
-
3650
- float sumf = 0.f;
3651
- for (int i = 0; i < nb; ++i) {
3652
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3653
- const uint8_t * GGML_RESTRICT qs = x[i].qs;
3654
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
3655
- const uint8_t * GGML_RESTRICT signs = x[i].signs;
3656
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
3657
- int32_t bsum = 0;
3658
- for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
3659
- const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
3660
- const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
3661
- int32_t sumi = 0;
3662
- for (int l = 0; l < 4; ++l) {
3663
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
3664
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
3665
- for (int j = 0; j < 4; ++j) {
3666
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3667
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3668
- }
3669
- q8 += 8;
3670
- }
3671
- qs += 8;
3672
- signs += 4;
3673
- bsum += sumi * ls1;
3674
- sumi = 0;
3675
- for (int l = 0; l < 4; ++l) {
3676
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
3677
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
3678
- for (int j = 0; j < 4; ++j) {
3679
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
3680
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
3681
- }
3682
- q8 += 8;
3683
- }
3684
- qs += 8;
3685
- signs += 4;
3686
- bsum += sumi * ls2;
3687
- }
3688
- sumf += d * bsum;
3689
- }
3690
- *s = sumf;
3298
+ UNUSED(x);
3299
+ UNUSED(y);
3300
+ UNUSED(nb);
3301
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3691
3302
  #endif
3692
3303
  }
3693
3304
 
3694
- #if defined(__AVX2__)
3695
- static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
3696
- const __m256i ax = _mm256_sign_epi8(x, x);
3697
- const __m256i sy = _mm256_sign_epi8(y, x);
3698
- return _mm256_maddubs_epi16(ax, sy);
3699
- }
3700
- #endif
3701
-
3702
3305
  void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3703
3306
  assert(n % QK_K == 0);
3704
3307
  assert(nrc == 1);
@@ -3811,36 +3414,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3811
3414
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
3812
3415
 
3813
3416
  #else
3814
-
3815
- float sumf = 0;
3816
- for (int i = 0; i < nb; i++) {
3817
-
3818
- const int8_t * q8 = y[i].qs;
3819
- const uint8_t * qs = x[i].qs;
3820
- const uint16_t * qh = x[i].qh;
3821
-
3822
- int sumi = 0, sumi1 = 0;
3823
- for (int ib = 0; ib < QK_K/32; ++ib) {
3824
- const int ls = 2*((qh[ib] >> 12) & 7) + 1;
3825
- const int delta = qh[ib] & 0x8000 ? -1 : 1;
3826
- int lsum = 0;
3827
- for (int l = 0; l < 4; ++l) {
3828
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
3829
- for (int j = 0; j < 8; ++j) {
3830
- lsum += q8[j] * grid[j];
3831
- }
3832
- q8 += 8;
3833
- }
3834
- sumi += ls * lsum;
3835
- sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
3836
- qs += 4;
3837
- }
3838
-
3839
- sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
3840
- }
3841
-
3842
- *s = sumf;
3843
-
3417
+ UNUSED(x);
3418
+ UNUSED(y);
3419
+ UNUSED(nb);
3420
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3844
3421
  #endif
3845
3422
  }
3846
3423
 
@@ -4043,52 +3620,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
4043
3620
  *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
4044
3621
 
4045
3622
  #else
4046
-
4047
- int sum1[2], sum2[2], delta[4];
4048
-
4049
- float sumf = 0;
4050
- for (int i = 0; i < nb; i++) {
4051
-
4052
- const int8_t * q8 = y[i].qs;
4053
- const uint8_t * qs = x[i].qs;
4054
- const uint8_t * qh = x[i].qh;
4055
- const uint16_t * sc = (const uint16_t *)x[i].scales;
4056
-
4057
- scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
4058
-
4059
- int sumi1 = 0, sumi2 = 0;
4060
- for (int ib = 0; ib < QK_K/32; ++ib) {
4061
- delta[0] = qh[0] & 0x08 ? -1 : 1;
4062
- delta[1] = qh[0] & 0x80 ? -1 : 1;
4063
- delta[2] = qh[1] & 0x08 ? -1 : 1;
4064
- delta[3] = qh[1] & 0x80 ? -1 : 1;
4065
- sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
4066
- for (int l = 0; l < 4; ++l) {
4067
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
4068
- int lsum1 = 0, lsum2 = 0;
4069
- for (int j = 0; j < 8; ++j) {
4070
- lsum1 += q8[j] * grid[j];
4071
- lsum2 += q8[j];
4072
- }
4073
- q8 += 8;
4074
- sum1[l/2] += lsum1;
4075
- sum2[l/2] += lsum2*delta[l];
4076
- }
4077
-
4078
- const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
4079
- const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
4080
-
4081
- sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
4082
- sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
4083
- qs += 4;
4084
- qh += 2;
4085
- }
4086
-
4087
- sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
4088
- }
4089
-
4090
- *s = sumf;
4091
-
3623
+ UNUSED(x);
3624
+ UNUSED(y);
3625
+ UNUSED(nb);
3626
+ UNUSED(scale);
3627
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4092
3628
  #endif
4093
3629
  }
4094
3630
 
@@ -4275,37 +3811,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
4275
3811
  *s = hsum_float_8(accum);
4276
3812
 
4277
3813
  #else
4278
- float sumf = 0;
4279
- for (int ibl = 0; ibl < nb; ++ibl) {
4280
- const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
4281
- uint16_t h = x[ibl].scales_h;
4282
- const uint8_t * qs = x[ibl].qs;
4283
- const int8_t * q8 = y[ibl].qs;
4284
- for (int ib = 0; ib < QK_K/32; ib += 2) {
4285
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
4286
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
4287
- h >>= 4;
4288
- const float d1 = d4d8*(ls1 - 32);
4289
- const float d2 = d4d8*(ls2 - 32);
4290
- int sumi1 = 0, sumi2 = 0;
4291
- for (int j = 0; j < 16; ++j) {
4292
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4293
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4294
- }
4295
- sumf += d1 * (sumi1 + sumi2);
4296
- qs += 16;
4297
- q8 += 32;
4298
- sumi1 = sumi2 = 0;
4299
- for (int j = 0; j < 16; ++j) {
4300
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
4301
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
4302
- }
4303
- sumf += d2 * (sumi1 + sumi2);
4304
- qs += 16;
4305
- q8 += 32;
4306
- }
4307
- }
4308
- *s = sumf;
3814
+ UNUSED(x);
3815
+ UNUSED(y);
3816
+ UNUSED(nb);
3817
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4309
3818
  #endif
4310
3819
  }
4311
3820