whispercpp 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586):
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
105
105
 
106
106
  return ((v4f32)res)[0];
107
107
  }
108
+
109
+ // multiply int8_t, add results pairwise twice
110
+ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
111
+ // Get absolute values of x vectors
112
+ const __m128i ax = __lsx_vsigncov_b(x, x);
113
+ // Sign the values of the y vectors
114
+ const __m128i sy = __lsx_vsigncov_b(x, y);
115
+ // Perform multiplication and create 16-bit values
116
+ const __m128i dot = lsx_maddubs_h(ax, sy);
117
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
118
+ return lsx_madd_h(ones, dot);
119
+ }
108
120
  #endif
109
121
 
110
122
  #if defined(__loongarch_asx)
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
323
335
  }
324
336
  }
325
337
 
326
- // multiply int8_t, add results pairwise twice
327
- static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
328
- // Get absolute values of x vectors
329
- const __m128i ax = __lsx_vsigncov_b(x, x);
330
- // Sign the values of the y vectors
331
- const __m128i sy = __lsx_vsigncov_b(x, y);
332
- // Perform multiplication and create 16-bit values
333
- const __m128i dot = lsx_maddubs_h(ax, sy);
334
- const __m128i ones = __lsx_vreplgr2vr_h(1);
335
- return lsx_madd_h(ones, dot);
336
- }
337
-
338
338
  // horizontally add 8 floats
339
339
  static inline float hsum_float_8(const __m256 x) {
340
340
  __m128 res = lasx_extractf128(x, 1);
@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
544
544
  __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
545
545
  max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
546
546
  __m128 tmp = max4;
547
- max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
547
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
548
548
  const float max_scalar = ((v4f32)max4)[0];
549
549
 
550
550
  // Quantize these floats
@@ -821,24 +821,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
821
821
 
822
822
  sumf = hsum_float_8(acc) + summs;
823
823
 
824
- #endif
825
- for (; ib < nb; ++ib) {
826
- int sumi0 = 0;
827
- int sumi1 = 0;
828
-
829
- for (int j = 0; j < qk/2; ++j) {
830
- const int v0 = (x[ib].qs[j] & 0x0F);
831
- const int v1 = (x[ib].qs[j] >> 4);
832
-
833
- sumi0 += (v0 * y[ib].qs[j]);
834
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
835
- }
836
-
837
- int sumi = sumi0 + sumi1;
838
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
839
- }
840
-
841
824
  *s = sumf;
825
+ #else
826
+ UNUSED(nb);
827
+ UNUSED(x);
828
+ UNUSED(y);
829
+ UNUSED(ib);
830
+ UNUSED(sumf);
831
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
832
+ #endif
842
833
  }
843
834
 
844
835
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -883,30 +874,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
883
874
 
884
875
  sumf = hsum_float_8(acc);
885
876
 
886
- #endif
887
- for (; ib < nb; ++ib) {
888
- uint32_t qh;
889
- memcpy(&qh, x[ib].qh, sizeof(qh));
890
-
891
- int sumi0 = 0;
892
- int sumi1 = 0;
893
-
894
- for (int j = 0; j < qk/2; ++j) {
895
- const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
896
- const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
897
-
898
- const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
899
- const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
900
-
901
- sumi0 += (x0 * y[ib].qs[j]);
902
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
903
- }
904
-
905
- int sumi = sumi0 + sumi1;
906
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
907
- }
908
-
909
877
  *s = sumf;
878
+ #else
879
+ UNUSED(nb);
880
+ UNUSED(ib);
881
+ UNUSED(sumf);
882
+ UNUSED(x);
883
+ UNUSED(y);
884
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
885
+ #endif
910
886
  }
911
887
 
912
888
  void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -954,30 +930,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
954
930
 
955
931
  sumf = hsum_float_8(acc) + summs;
956
932
 
957
- #endif
958
- for (; ib < nb; ++ib) {
959
- uint32_t qh;
960
- memcpy(&qh, x[ib].qh, sizeof(qh));
961
-
962
- int sumi0 = 0;
963
- int sumi1 = 0;
964
-
965
- for (int j = 0; j < qk/2; ++j) {
966
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
967
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
968
-
969
- const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
970
- const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
971
-
972
- sumi0 += (x0 * y[ib].qs[j]);
973
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
974
- }
975
-
976
- int sumi = sumi0 + sumi1;
977
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
978
- }
979
-
980
933
  *s = sumf;
934
+ #else
935
+ UNUSED(nb);
936
+ UNUSED(ib);
937
+ UNUSED(sumf);
938
+ UNUSED(x);
939
+ UNUSED(y);
940
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
941
+ #endif
981
942
  }
982
943
 
983
944
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1016,18 +977,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1016
977
 
1017
978
  sumf = hsum_float_8(acc);
1018
979
 
1019
- #endif
1020
- for (; ib < nb; ++ib) {
1021
- int sumi = 0;
1022
-
1023
- for (int j = 0; j < qk; j++) {
1024
- sumi += x[ib].qs[j]*y[ib].qs[j];
1025
- }
1026
-
1027
- sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
1028
- }
1029
-
1030
980
  *s = sumf;
981
+ #else
982
+ UNUSED(nb);
983
+ UNUSED(ib);
984
+ UNUSED(sumf);
985
+ UNUSED(x);
986
+ UNUSED(y);
987
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
988
+ #endif
1031
989
  }
1032
990
 
1033
991
  void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1103,45 +1061,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1103
1061
  *s = hsum_float_8(acc);
1104
1062
 
1105
1063
  #else
1106
-
1107
- float sumf = 0;
1108
-
1109
- for (int i = 0; i < nb; ++i) {
1110
-
1111
- const uint8_t * q2 = x[i].qs;
1112
- const int8_t * q8 = y[i].qs;
1113
- const uint8_t * sc = x[i].scales;
1114
-
1115
- int summs = 0;
1116
- for (int j = 0; j < 16; ++j) {
1117
- summs += y[i].bsums[j] * (sc[j] >> 4);
1118
- }
1119
-
1120
- const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1121
- const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1122
-
1123
- int isum = 0;
1124
- int is = 0;
1125
- int d;
1126
- for (int k = 0; k < QK_K/128; ++k) {
1127
- int shift = 0;
1128
- for (int j = 0; j < 4; ++j) {
1129
- d = sc[is++] & 0xF;
1130
- int isuml = 0;
1131
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1132
- isum += d * isuml;
1133
- d = sc[is++] & 0xF;
1134
- isuml = 0;
1135
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1136
- isum += d * isuml;
1137
- shift += 2;
1138
- q8 += 32;
1139
- }
1140
- q2 += 32;
1141
- }
1142
- sumf += dall * isum - dmin * summs;
1143
- }
1144
- *s = sumf;
1064
+ UNUSED(x);
1065
+ UNUSED(y);
1066
+ UNUSED(nb);
1067
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1145
1068
  #endif
1146
1069
  }
1147
1070
 
@@ -1239,70 +1162,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1239
1162
  *s = hsum_float_8(acc);
1240
1163
 
1241
1164
  #else
1242
- // scalar version
1243
- // This function is written like this so the compiler can manage to vectorize most of it
1244
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1245
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1246
- // The ideal situation would be if we could just write the code once, and the compiler would
1247
- // automatically produce the best possible set of machine instructions, instead of us having to manually
1248
- // write vectorized versions for AVX, ARM_NEON, etc.
1249
-
1250
- int8_t aux8[QK_K];
1251
- int16_t aux16[8];
1252
- float sums [8];
1253
- int32_t aux32[8];
1254
- memset(sums, 0, 8*sizeof(float));
1255
-
1256
- uint32_t auxs[4];
1257
- const int8_t * scales = (const int8_t*)auxs;
1258
-
1259
- float sumf = 0;
1260
- for (int i = 0; i < nb; ++i) {
1261
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1262
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1263
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1264
- memset(aux32, 0, 8*sizeof(int32_t));
1265
- int8_t * GGML_RESTRICT a = aux8;
1266
- uint8_t m = 1;
1267
- for (int j = 0; j < QK_K; j += 128) {
1268
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1269
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1270
- a += 32; m <<= 1;
1271
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
1272
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1273
- a += 32; m <<= 1;
1274
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
1275
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1276
- a += 32; m <<= 1;
1277
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
1278
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1279
- a += 32; m <<= 1;
1280
- q3 += 32;
1281
- }
1282
- a = aux8;
1283
-
1284
- memcpy(auxs, x[i].scales, 12);
1285
- uint32_t tmp = auxs[2];
1286
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
1287
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
1288
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
1289
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
1290
- for (int j = 0; j < QK_K/16; ++j) {
1291
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1292
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1293
- q8 += 8; a += 8;
1294
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1295
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1296
- q8 += 8; a += 8;
1297
- }
1298
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1299
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1300
- }
1301
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1302
- *s = sumf;
1303
-
1165
+ UNUSED(kmask1);
1166
+ UNUSED(kmask2);
1167
+ UNUSED(x);
1168
+ UNUSED(y);
1169
+ UNUSED(nb);
1170
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1304
1171
  #endif
1305
-
1306
1172
  }
1307
1173
 
1308
1174
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1391,61 +1257,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1391
1257
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
1392
1258
 
1393
1259
  #else
1394
-
1395
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1396
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1397
-
1398
- int8_t aux8[QK_K];
1399
- int16_t aux16[8];
1400
- float sums [8];
1401
- int32_t aux32[8];
1402
- memset(sums, 0, 8*sizeof(float));
1403
-
1404
- float sumf = 0;
1405
- for (int i = 0; i < nb; ++i) {
1406
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1407
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1408
- memset(aux32, 0, 8*sizeof(int32_t));
1409
- int8_t * GGML_RESTRICT a = aux8;
1410
- for (int j = 0; j < QK_K/64; ++j) {
1411
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1412
- a += 32;
1413
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1414
- a += 32; q4 += 32;
1415
- }
1416
- memcpy(utmp, x[i].scales, 12);
1417
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1418
- const uint32_t uaux = utmp[1] & kmask1;
1419
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1420
- utmp[2] = uaux;
1421
- utmp[0] &= kmask1;
1422
-
1423
- int sumi = 0;
1424
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1425
- a = aux8;
1426
- int is = 0;
1427
- for (int j = 0; j < QK_K/32; ++j) {
1428
- int32_t scale = scales[is++];
1429
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1430
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1431
- q8 += 8; a += 8;
1432
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1433
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1434
- q8 += 8; a += 8;
1435
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1436
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1437
- q8 += 8; a += 8;
1438
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1439
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1440
- q8 += 8; a += 8;
1441
- }
1442
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1443
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1444
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1445
- sumf -= dmin * sumi;
1446
- }
1447
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1448
- *s = sumf;
1260
+ UNUSED(x);
1261
+ UNUSED(y);
1262
+ UNUSED(nb);
1263
+ UNUSED(kmask1);
1264
+ UNUSED(kmask2);
1265
+ UNUSED(kmask3);
1266
+ UNUSED(utmp);
1267
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1449
1268
  #endif
1450
1269
  }
1451
1270
 
@@ -1541,66 +1360,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1541
1360
  *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
1542
1361
 
1543
1362
  #else
1544
-
1545
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1546
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1547
-
1548
- int8_t aux8[QK_K];
1549
- int16_t aux16[8];
1550
- float sums [8];
1551
- int32_t aux32[8];
1552
- memset(sums, 0, 8*sizeof(float));
1553
-
1554
- float sumf = 0;
1555
- for (int i = 0; i < nb; ++i) {
1556
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1557
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
1558
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1559
- memset(aux32, 0, 8*sizeof(int32_t));
1560
- int8_t * GGML_RESTRICT a = aux8;
1561
- uint8_t m = 1;
1562
- for (int j = 0; j < QK_K/64; ++j) {
1563
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1564
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1565
- a += 32; m <<= 1;
1566
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1567
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1568
- a += 32; m <<= 1;
1569
- q4 += 32;
1570
- }
1571
- memcpy(utmp, x[i].scales, 12);
1572
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1573
- const uint32_t uaux = utmp[1] & kmask1;
1574
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1575
- utmp[2] = uaux;
1576
- utmp[0] &= kmask1;
1577
-
1578
- int sumi = 0;
1579
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1580
- a = aux8;
1581
- int is = 0;
1582
- for (int j = 0; j < QK_K/32; ++j) {
1583
- int32_t scale = scales[is++];
1584
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1585
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1586
- q8 += 8; a += 8;
1587
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1588
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1589
- q8 += 8; a += 8;
1590
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1591
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1592
- q8 += 8; a += 8;
1593
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1594
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1595
- q8 += 8; a += 8;
1596
- }
1597
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1598
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1599
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1600
- sumf -= dmin * sumi;
1601
- }
1602
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1603
- *s = sumf;
1363
+ UNUSED(x);
1364
+ UNUSED(y);
1365
+ UNUSED(nb);
1366
+ UNUSED(kmask1);
1367
+ UNUSED(kmask2);
1368
+ UNUSED(kmask3);
1369
+ UNUSED(utmp);
1370
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1604
1371
  #endif
1605
1372
  }
1606
1373
 
@@ -1678,47 +1445,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1678
1445
  *s = hsum_float_8(acc);
1679
1446
 
1680
1447
  #else
1681
-
1682
- int8_t aux8[QK_K];
1683
- int16_t aux16[8];
1684
- float sums [8];
1685
- int32_t aux32[8];
1686
- memset(sums, 0, 8*sizeof(float));
1687
-
1688
- float sumf = 0;
1689
- for (int i = 0; i < nb; ++i) {
1690
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1691
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
1692
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1693
- memset(aux32, 0, 8*sizeof(int32_t));
1694
- int8_t * GGML_RESTRICT a = aux8;
1695
- for (int j = 0; j < QK_K; j += 128) {
1696
- for (int l = 0; l < 32; ++l) {
1697
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1698
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1699
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1700
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1701
- }
1702
- a += 128;
1703
- q4 += 64;
1704
- qh += 32;
1705
- }
1706
- a = aux8;
1707
- int is = 0;
1708
- for (int j = 0; j < QK_K/16; ++j) {
1709
- int scale = x[i].scales[is++];
1710
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1711
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1712
- q8 += 8; a += 8;
1713
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1714
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1715
- q8 += 8; a += 8;
1716
- }
1717
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1718
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1719
- }
1720
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1721
- *s = sumf;
1448
+ UNUSED(x);
1449
+ UNUSED(y);
1450
+ UNUSED(nb);
1451
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1722
1452
  #endif
1723
1453
  }
1724
1454
 
@@ -1815,34 +1545,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
1815
1545
  *s = 0.125f * hsum_float_8(accumf);
1816
1546
 
1817
1547
  #else
1818
-
1819
- uint32_t aux32[2];
1820
- const uint8_t * aux8 = (const uint8_t *)aux32;
1821
-
1822
- float sumf = 0.f;
1823
- for (int i = 0; i < nb; ++i) {
1824
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1825
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1826
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1827
- int32_t bsum = 0;
1828
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1829
- memcpy(aux32, q2, 2*sizeof(uint32_t));
1830
- q2 += 4;
1831
- const uint32_t ls = 2*(aux32[1] >> 28) + 1;
1832
- int32_t sumi = 0;
1833
- for (int l = 0; l < 4; ++l) {
1834
- const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
1835
- const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
1836
- for (int j = 0; j < 8; ++j) {
1837
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1838
- }
1839
- q8 += 8;
1840
- }
1841
- bsum += sumi * ls;
1842
- }
1843
- sumf += d * bsum;
1844
- }
1845
- *s = 0.125f * sumf;
1548
+ UNUSED(x);
1549
+ UNUSED(y);
1550
+ UNUSED(nb);
1551
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1846
1552
  #endif
1847
1553
  }
1848
1554
 
@@ -1978,42 +1684,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1978
1684
  *s = 0.125f * hsum_float_8(accumf);
1979
1685
 
1980
1686
  #else
1981
-
1982
- float sumf = 0.f;
1983
- for (int i = 0; i < nb; ++i) {
1984
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1985
- const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1986
- const uint8_t * GGML_RESTRICT sc = x[i].scales;
1987
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1988
- int32_t bsum = 0;
1989
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1990
- const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
1991
- const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
1992
- int32_t sumi = 0;
1993
- for (int l = 0; l < 2; ++l) {
1994
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
1995
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
1996
- for (int j = 0; j < 8; ++j) {
1997
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1998
- }
1999
- q8 += 8;
2000
- }
2001
- bsum += sumi * ls1;
2002
- sumi = 0;
2003
- for (int l = 2; l < 4; ++l) {
2004
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
2005
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
2006
- for (int j = 0; j < 8; ++j) {
2007
- sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
2008
- }
2009
- q8 += 8;
2010
- }
2011
- bsum += sumi * ls2;
2012
- q2 += 4;
2013
- }
2014
- sumf += d * bsum;
2015
- }
2016
- *s = 0.125f * sumf;
1687
+ UNUSED(x);
1688
+ UNUSED(y);
1689
+ UNUSED(nb);
1690
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2017
1691
  #endif
2018
1692
  }
2019
1693
 
@@ -2105,47 +1779,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2105
1779
  *s = 0.125f * hsum_float_8(accumf);
2106
1780
 
2107
1781
  #else
2108
-
2109
- float sumf = 0;
2110
- for (int i = 0; i < nb; i++) {
2111
-
2112
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2113
- const int8_t * q8 = y[i].qs;
2114
- const uint8_t * qs = x[i].qs;
2115
- const uint8_t * qh = x[i].qh;
2116
- const uint8_t * signs = qs + QK_K/8;
2117
-
2118
- int bsum = 0;
2119
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2120
- int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
2121
- int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
2122
- int sumi1 = 0, sumi2 = 0;
2123
- for (int l = 0; l < 2; ++l) {
2124
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2125
- for (int j = 0; j < 8; ++j) {
2126
- sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2127
- }
2128
- q8 += 8;
2129
- }
2130
- for (int l = 2; l < 4; ++l) {
2131
- const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2132
- for (int j = 0; j < 8; ++j) {
2133
- sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2134
- }
2135
- q8 += 8;
2136
- }
2137
- bsum += ls1 * sumi1 + ls2 * sumi2;
2138
- qs += 4;
2139
- signs += 4;
2140
- }
2141
-
2142
- sumf += d * bsum;
2143
- }
2144
-
2145
- *s = 0.125f * sumf;
2146
-
1782
+ UNUSED(x);
1783
+ UNUSED(y);
1784
+ UNUSED(nb);
1785
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2147
1786
  #endif
2148
-
2149
1787
  }
2150
1788
 
2151
1789
  void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2209,36 +1847,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
2209
1847
  *s = 0.25f * hsum_float_8(accumf);
2210
1848
 
2211
1849
  #else
2212
-
2213
- uint32_t aux32;
2214
-
2215
- float sumf = 0.f;
2216
- for (int i = 0; i < nb; ++i) {
2217
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2218
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2219
- const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2220
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2221
- int32_t bsum = 0;
2222
- for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2223
- memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
2224
- const uint32_t ls = 2*(aux32 >> 28) + 1;
2225
- int32_t sumi = 0;
2226
- for (int l = 0; l < 4; ++l) {
2227
- const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
2228
- const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
2229
- const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
2230
- for (int j = 0; j < 4; ++j) {
2231
- sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
2232
- sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
2233
- }
2234
- q8 += 8;
2235
- }
2236
- q3 += 8;
2237
- bsum += sumi * ls;
2238
- }
2239
- sumf += d * bsum;
2240
- }
2241
- *s = 0.25f * sumf;
1850
+ UNUSED(x);
1851
+ UNUSED(y);
1852
+ UNUSED(nb);
1853
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2242
1854
  #endif
2243
1855
  }
2244
1856
 
@@ -2338,48 +1950,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2338
1950
  *s = hsum_float_8(accumf);
2339
1951
 
2340
1952
  #else
2341
-
2342
- float sumf = 0.f;
2343
- for (int i = 0; i < nb; ++i) {
2344
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2345
- const uint8_t * GGML_RESTRICT qs = x[i].qs;
2346
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
2347
- const uint8_t * GGML_RESTRICT signs = x[i].signs;
2348
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2349
- int32_t bsum = 0;
2350
- for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2351
- const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
2352
- const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
2353
- int32_t sumi = 0;
2354
- for (int l = 0; l < 4; ++l) {
2355
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
2356
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
2357
- for (int j = 0; j < 4; ++j) {
2358
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2359
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2360
- }
2361
- q8 += 8;
2362
- }
2363
- qs += 8;
2364
- signs += 4;
2365
- bsum += sumi * ls1;
2366
- sumi = 0;
2367
- for (int l = 0; l < 4; ++l) {
2368
- const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
2369
- const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
2370
- for (int j = 0; j < 4; ++j) {
2371
- sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2372
- sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2373
- }
2374
- q8 += 8;
2375
- }
2376
- qs += 8;
2377
- signs += 4;
2378
- bsum += sumi * ls2;
2379
- }
2380
- sumf += d * bsum;
2381
- }
2382
- *s = sumf;
1953
+ UNUSED(x);
1954
+ UNUSED(y);
1955
+ UNUSED(nb);
1956
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2383
1957
  #endif
2384
1958
  }
2385
1959
 
@@ -2460,36 +2034,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2460
2034
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
2461
2035
 
2462
2036
  #else
2463
-
2464
- float sumf = 0;
2465
- for (int i = 0; i < nb; i++) {
2466
-
2467
- const int8_t * q8 = y[i].qs;
2468
- const uint8_t * qs = x[i].qs;
2469
- const uint16_t * qh = x[i].qh;
2470
-
2471
- int sumi = 0, sumi1 = 0;
2472
- for (int ib = 0; ib < QK_K/32; ++ib) {
2473
- const int ls = 2*((qh[ib] >> 12) & 7) + 1;
2474
- const int delta = qh[ib] & 0x8000 ? -1 : 1;
2475
- int lsum = 0;
2476
- for (int l = 0; l < 4; ++l) {
2477
- const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
2478
- for (int j = 0; j < 8; ++j) {
2479
- lsum += q8[j] * grid[j];
2480
- }
2481
- q8 += 8;
2482
- }
2483
- sumi += ls * lsum;
2484
- sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
2485
- qs += 4;
2486
- }
2487
-
2488
- sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2489
- }
2490
-
2491
- *s = sumf;
2492
-
2037
+ UNUSED(x);
2038
+ UNUSED(y);
2039
+ UNUSED(nb);
2040
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2493
2041
  #endif
2494
2042
  }
2495
2043
 
@@ -2603,37 +2151,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
2603
2151
  *s = hsum_float_8(accum);
2604
2152
 
2605
2153
  #else
2606
- float sumf = 0;
2607
- for (int ibl = 0; ibl < nb; ++ibl) {
2608
- const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
2609
- uint16_t h = x[ibl].scales_h;
2610
- const uint8_t * qs = x[ibl].qs;
2611
- const int8_t * q8 = y[ibl].qs;
2612
- for (int ib = 0; ib < QK_K/32; ib += 2) {
2613
- const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
2614
- const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
2615
- h >>= 4;
2616
- const float d1 = d4d8*(ls1 - 32);
2617
- const float d2 = d4d8*(ls2 - 32);
2618
- int sumi1 = 0, sumi2 = 0;
2619
- for (int j = 0; j < 16; ++j) {
2620
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2621
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2622
- }
2623
- sumf += d1 * (sumi1 + sumi2);
2624
- qs += 16;
2625
- q8 += 32;
2626
- sumi1 = sumi2 = 0;
2627
- for (int j = 0; j < 16; ++j) {
2628
- sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2629
- sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2630
- }
2631
- sumf += d2 * (sumi1 + sumi2);
2632
- qs += 16;
2633
- q8 += 32;
2634
- }
2635
- }
2636
- *s = sumf;
2154
+ UNUSED(x);
2155
+ UNUSED(y);
2156
+ UNUSED(nb);
2157
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2637
2158
  #endif
2638
2159
  }
2639
2160