whispercpp 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -116,6 +116,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
116
116
  //===================================== Dot products =================================
117
117
 
118
118
  void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
119
+ #if defined(__riscv_v)
119
120
  const int qk = QK8_0;
120
121
  const int nb = n / qk;
121
122
 
@@ -132,7 +133,6 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
132
133
  int ib = 0;
133
134
  float sumf = 0;
134
135
 
135
- #if defined(__riscv_v)
136
136
  size_t vl = qk / 2;
137
137
 
138
138
  for (; ib < nb; ++ib) {
@@ -164,27 +164,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
164
164
  sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
165
165
  }
166
166
 
167
- #endif
168
- for (; ib < nb; ++ib) {
169
- int sumi0 = 0;
170
- int sumi1 = 0;
171
-
172
- for (int j = 0; j < qk/2; ++j) {
173
- const int v0 = (x[ib].qs[j] & 0x0F) - 8;
174
- const int v1 = (x[ib].qs[j] >> 4) - 8;
175
-
176
- sumi0 += (v0 * y[ib].qs[j]);
177
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
178
- }
179
-
180
- int sumi = sumi0 + sumi1;
181
- sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
182
- }
183
-
184
167
  *s = sumf;
168
+ #else
169
+ ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
170
+ #endif
185
171
  }
186
172
 
187
173
  void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
174
+ #if defined(__riscv_v)
188
175
  const int qk = QK8_1;
189
176
  const int nb = n / qk;
190
177
 
@@ -201,7 +188,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
201
188
  int ib = 0;
202
189
  float sumf = 0;
203
190
 
204
- #if defined(__riscv_v)
205
191
  size_t vl = qk / 2;
206
192
 
207
193
  for (; ib < nb; ++ib) {
@@ -229,27 +215,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
229
215
  sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
230
216
  }
231
217
 
232
- #endif
233
- for (; ib < nb; ++ib) {
234
- int sumi0 = 0;
235
- int sumi1 = 0;
236
-
237
- for (int j = 0; j < qk/2; ++j) {
238
- const int v0 = (x[ib].qs[j] & 0x0F);
239
- const int v1 = (x[ib].qs[j] >> 4);
240
-
241
- sumi0 += (v0 * y[ib].qs[j]);
242
- sumi1 += (v1 * y[ib].qs[j + qk/2]);
243
- }
244
-
245
- int sumi = sumi0 + sumi1;
246
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
247
- }
248
-
249
218
  *s = sumf;
219
+ #else
220
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
221
+ #endif
250
222
  }
251
223
 
252
224
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
225
+ #if defined(__riscv_v)
253
226
  const int qk = QK8_0;
254
227
  const int nb = n / qk;
255
228
 
@@ -267,7 +240,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
267
240
  const block_q5_0 * GGML_RESTRICT x = vx;
268
241
  const block_q8_0 * GGML_RESTRICT y = vy;
269
242
 
270
- #if defined(__riscv_v)
271
243
  size_t vl;
272
244
  size_t vlenb = __riscv_vlenb();
273
245
 
@@ -297,33 +269,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
297
269
  sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
298
270
  }
299
271
 
300
- #endif
301
- for (; ib < nb; ++ib) {
302
- uint32_t qh;
303
- memcpy(&qh, x[ib].qh, sizeof(qh));
304
-
305
- int sumi0 = 0;
306
- int sumi1 = 0;
307
-
308
- for (int j = 0; j < qk/2; ++j) {
309
- const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
310
- const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
311
-
312
- const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
313
- const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
314
-
315
- sumi0 += (x0 * y[ib].qs[j]);
316
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
317
- }
318
-
319
- int sumi = sumi0 + sumi1;
320
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
321
- }
322
-
323
272
  *s = sumf;
273
+ #else
274
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
275
+ #endif
324
276
  }
325
277
 
326
278
  void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
279
+ #if defined(__riscv_v)
327
280
  const int qk = QK8_1;
328
281
  const int nb = n / qk;
329
282
 
@@ -341,7 +294,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
341
294
  const block_q5_1 * GGML_RESTRICT x = vx;
342
295
  const block_q8_1 * GGML_RESTRICT y = vy;
343
296
 
344
- #if defined(__riscv_v)
345
297
  size_t vl;
346
298
  size_t vlenb = __riscv_vlenb();
347
299
 
@@ -370,30 +322,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
370
322
  sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
371
323
  }
372
324
 
373
- #endif
374
- for (; ib < nb; ++ib) {
375
- uint32_t qh;
376
- memcpy(&qh, x[ib].qh, sizeof(qh));
377
-
378
- int sumi0 = 0;
379
- int sumi1 = 0;
380
-
381
- for (int j = 0; j < qk/2; ++j) {
382
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
383
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
384
-
385
- const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
386
- const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
387
-
388
- sumi0 += (x0 * y[ib].qs[j]);
389
- sumi1 += (x1 * y[ib].qs[j + qk/2]);
390
- }
391
-
392
- int sumi = sumi0 + sumi1;
393
- sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
394
- }
395
-
396
325
  *s = sumf;
326
+ #else
327
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
328
+ #endif
397
329
  }
398
330
 
399
331
  void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -431,18 +363,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
431
363
  sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
432
364
  }
433
365
 
434
- #endif
435
- for (; ib < nb; ++ib) {
436
- int sumi = 0;
437
-
438
- for (int j = 0; j < qk; j++) {
439
- sumi += x[ib].qs[j]*y[ib].qs[j];
440
- }
366
+ *s = sumf;
367
+ #else
441
368
 
442
- sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
443
- }
369
+ UNUSED(nb);
370
+ UNUSED(x);
371
+ UNUSED(y);
372
+ UNUSED(ib);
373
+ UNUSED(sumf);
444
374
 
445
- *s = sumf;
375
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
376
+ #endif
446
377
  }
447
378
 
448
379
  void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -738,44 +669,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
738
669
 
739
670
  #else
740
671
 
741
- float sumf = 0;
742
-
743
- for (int i = 0; i < nb; ++i) {
744
-
745
- const uint8_t * q2 = x[i].qs;
746
- const int8_t * q8 = y[i].qs;
747
- const uint8_t * sc = x[i].scales;
672
+ UNUSED(x);
673
+ UNUSED(y);
674
+ UNUSED(nb);
748
675
 
749
- int summs = 0;
750
- for (int j = 0; j < 16; ++j) {
751
- summs += y[i].bsums[j] * (sc[j] >> 4);
752
- }
753
-
754
- const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
755
- const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
756
-
757
- int isum = 0;
758
- int is = 0;
759
- int d;
760
- for (int k = 0; k < QK_K/128; ++k) {
761
- int shift = 0;
762
- for (int j = 0; j < 4; ++j) {
763
- d = sc[is++] & 0xF;
764
- int isuml = 0;
765
- for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
766
- isum += d * isuml;
767
- d = sc[is++] & 0xF;
768
- isuml = 0;
769
- for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
770
- isum += d * isuml;
771
- shift += 2;
772
- q8 += 32;
773
- }
774
- q2 += 32;
775
- }
776
- sumf += dall * isum - dmin * summs;
777
- }
778
- *s = sumf;
676
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
779
677
  #endif
780
678
  }
781
679
 
@@ -1147,68 +1045,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1147
1045
  *s = sumf;
1148
1046
 
1149
1047
  #else
1150
- // scalar version
1151
- // This function is written like this so the compiler can manage to vectorize most of it
1152
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1153
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1154
- // The ideal situation would be if we could just write the code once, and the compiler would
1155
- // automatically produce the best possible set of machine instructions, instead of us having to manually
1156
- // write vectorized versions for AVX, ARM_NEON, etc.
1157
-
1158
- int8_t aux8[QK_K];
1159
- int16_t aux16[8];
1160
- float sums [8];
1161
- int32_t aux32[8];
1162
- memset(sums, 0, 8*sizeof(float));
1163
-
1164
- uint32_t auxs[4];
1165
- const int8_t * scales = (const int8_t*)auxs;
1166
1048
 
1167
- float sumf = 0;
1168
- for (int i = 0; i < nb; ++i) {
1169
- const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1170
- const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1171
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1172
- memset(aux32, 0, 8*sizeof(int32_t));
1173
- int8_t * GGML_RESTRICT a = aux8;
1174
- uint8_t m = 1;
1175
- for (int j = 0; j < QK_K; j += 128) {
1176
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1177
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1178
- a += 32; m <<= 1;
1179
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
1180
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1181
- a += 32; m <<= 1;
1182
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
1183
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1184
- a += 32; m <<= 1;
1185
- for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
1186
- for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1187
- a += 32; m <<= 1;
1188
- q3 += 32;
1189
- }
1190
- a = aux8;
1191
-
1192
- memcpy(auxs, x[i].scales, 12);
1193
- uint32_t tmp = auxs[2];
1194
- auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
1195
- auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
1196
- auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
1197
- auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
1198
- for (int j = 0; j < QK_K/16; ++j) {
1199
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1200
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1201
- q8 += 8; a += 8;
1202
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1203
- for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1204
- q8 += 8; a += 8;
1205
- }
1206
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1207
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1208
- }
1209
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1210
- *s = sumf;
1049
+ UNUSED(kmask1);
1050
+ UNUSED(kmask2);
1051
+ UNUSED(x);
1052
+ UNUSED(y);
1053
+ UNUSED(nb);
1211
1054
 
1055
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1212
1056
  #endif
1213
1057
 
1214
1058
  }
@@ -1426,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1426
1270
  const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1427
1271
  const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1428
1272
 
1429
- int tmp, tmp2, sumi;
1273
+ float ftmp, ft2;
1274
+ const uint8_t * restrict q40;
1275
+ const uint8_t * restrict q41;
1276
+ const uint8_t * restrict q42;
1277
+ const uint8_t * restrict q43;
1278
+ const int8_t * restrict q80;
1279
+ const int8_t * restrict q81;
1280
+ const int8_t * restrict q82;
1281
+ const int8_t * restrict q83;
1282
+ int s0, s1, s2, s3;
1283
+
1430
1284
  __asm__ __volatile__(
1431
- "vsetivli zero, 12, e8, m1\n\t"
1432
- "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
1433
- "vsetivli zero, 4, e32, m1\n\t"
1285
+ "li %[s1], 8\n\t"
1286
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1287
+ "vle32.v v1, (%[s6b])\n\t"
1288
+ "vslide1down.vx v1, v1, zero\n\t"
1289
+ "vmv.v.x v16, zero\n\t"
1434
1290
  "vslidedown.vi v2, v1, 2\n\t"
1435
1291
  "vmv1r.v v3, v2\n\t"
1436
1292
  "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
1437
- "vsetivli zero, 2, e32, m1\n\t"
1293
+ "vsetivli zero, 2, e32, m1, ta, ma\n\t"
1438
1294
  "vmv.v.i v4, 4\n\t"
1439
1295
  "vand.vx v8, v1, %[kmask1]\n\t"
1440
1296
  "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
1441
1297
  "vsrl.vi v6, v1, 6\n\t"
1442
1298
  "vsrl.vv v7, v2, v5\n\t"
1299
+ "vsse32.v v8, (%[utmp]), %[s1]\n\t"
1443
1300
  "vand.vx v0, v6, %[kmask3]\n\t"
1444
1301
  "vand.vx v2, v7, %[kmask2]\n\t"
1445
1302
  "vsll.vi v6, v0, 4\n\t"
1446
- "li %[t2], 8\n\t"
1447
- "addi %[t1], %[utmp], 4\n\t"
1303
+ "addi %[s0], %[utmp], 4\n\t"
1448
1304
  "vor.vv v1, v6, v2\n\t"
1449
- "vsse32.v v8, (%[utmp]), %[t2]\n\t"
1450
- "vsse32.v v1, (%[t1]), %[t2]\n\t"
1451
- "vsetivli zero, 8, e16, m1\n\t"
1305
+ "vsse32.v v1, (%[s0]), %[s1]\n\t"
1306
+ "vsetivli zero, 8, e16, m1, ta, ma\n\t"
1452
1307
  "vle32.v v2, (%[bsums])\n\t"
1453
1308
  "vnsrl.wi v0, v2, 0\n\t"
1454
1309
  "vnsrl.wi v1, v2, 16\n\t"
@@ -1456,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1456
1311
  "vle8.v v3, (%[mins])\n\t"
1457
1312
  "vzext.vf2 v4, v3\n\t"
1458
1313
  "vwmul.vv v6, v4, v2\n\t"
1314
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1315
+ "vredsum.vs v0, v6, v16\n\t"
1316
+ "vredsum.vs v0, v7, v0\n\t"
1317
+ "vfcvt.f.x.v v0, v0\n\t"
1318
+ "vfmv.f.s %[ftmp], v0\n\t"
1319
+ "vsetivli zero, 16, e8, m1, ta, ma\n\t"
1320
+ "vle8.v v0, (%[xs])\n\t"
1321
+ "fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
1322
+ "addi %[q40], %[xs], 64\n\t"
1323
+ "addi %[q41], %[xs], 16\n\t"
1324
+ "addi %[q42], %[xs], 32\n\t"
1325
+ "addi %[q43], %[xs], 48\n\t"
1326
+ "addi %[q80], %[ys], 64\n\t"
1327
+ "vle8.v v1, (%[q41])\n\t"
1328
+ "vle8.v v2, (%[q42])\n\t"
1329
+ "addi %[q81], %[ys], 16\n\t"
1330
+ "addi %[q41], %[q41], 64\n\t"
1331
+ "addi %[q82], %[ys], 32\n\t"
1332
+ "vle8.v v3, (%[q43])\n\t"
1333
+ "vle8.v v8, (%[ys])\n\t"
1334
+ "addi %[q42], %[q42], 64\n\t"
1335
+ "addi %[q83], %[ys], 48\n\t"
1336
+ "addi %[q43], %[q43], 64\n\t"
1337
+ "vsrl.vi v4, v0, 4\n\t"
1338
+ "vle8.v v9, (%[q81])\n\t"
1339
+ "vle8.v v10, (%[q82])\n\t"
1340
+ "vand.vi v0, v0, 0xF\n\t"
1341
+ "addi %[q81], %[q81], 64\n\t"
1342
+ "vsrl.vi v5, v1, 4\n\t"
1343
+ "addi %[q82], %[q82], 64\n\t"
1344
+ "vle8.v v11, (%[q83])\n\t"
1345
+ "vle8.v v12, (%[q80])\n\t"
1346
+ "vand.vi v1, v1, 0xF\n\t"
1347
+ "addi %[q83], %[q83], 64\n\t"
1348
+ "vsrl.vi v6, v2, 4\n\t"
1349
+ "addi %[q80], %[q80], 64\n\t"
1350
+ "vle8.v v13, (%[q81])\n\t"
1351
+ "vle8.v v14, (%[q82])\n\t"
1352
+ "vand.vi v2, v2, 0xF\n\t"
1353
+ "addi %[q81], %[q81], 64\n\t"
1354
+ "vsrl.vi v7, v3, 4\n\t"
1355
+ "addi %[q82], %[q82], 64\n\t"
1356
+ "vwmul.vv v16, v0, v8\n\t"
1357
+ "vle8.v v15, (%[q83])\n\t"
1358
+ "vle8.v v0, (%[q40])\n\t"
1359
+ "vand.vi v3, v3, 0xF\n\t"
1360
+ "addi %[q83], %[q83], 64\n\t"
1361
+ "vwmul.vv v24, v2, v12\n\t"
1362
+ "vwmul.vv v20, v4, v10\n\t"
1363
+ "vwmul.vv v28, v6, v14\n\t"
1364
+ "vwmacc.vv v16, v1, v9\n\t"
1365
+ "vle8.v v1, (%[q41])\n\t"
1366
+ "vle8.v v2, (%[q42])\n\t"
1367
+ "vwmacc.vv v24, v3, v13\n\t"
1368
+ "vwmacc.vv v20, v5, v11\n\t"
1369
+ "vwmacc.vv v28, v7, v15\n\t"
1370
+ "addi %[q40], %[q80], 64\n\t"
1371
+ "addi %[q41], %[q81], 64\n\t"
1372
+ "vle8.v v3, (%[q43])\n\t"
1373
+ "vle8.v v8, (%[q80])\n\t"
1374
+ "addi %[q42], %[q82], 64\n\t"
1375
+ "addi %[q43], %[q83], 64\n\t"
1376
+ "vsrl.vi v4, v0, 4\n\t"
1377
+ "vle8.v v9, (%[q81])\n\t"
1378
+ "vle8.v v10, (%[q82])\n\t"
1379
+ "vand.vi v0, v0, 0xF\n\t"
1380
+ "vsrl.vi v5, v1, 4\n\t"
1381
+ "vsrl.vi v7, v3, 4\n\t"
1382
+ "vand.vi v3, v3, 0xF\n\t"
1383
+ "vle8.v v11, (%[q83])\n\t"
1384
+ "vle8.v v12, (%[q40])\n\t"
1385
+ "vand.vi v1, v1, 0xF\n\t"
1386
+ "vsrl.vi v6, v2, 4\n\t"
1387
+ "vand.vi v2, v2, 0xF\n\t"
1388
+ "vwmul.vv v18, v0, v8\n\t"
1389
+ "vle8.v v13, (%[q41])\n\t"
1390
+ "vle8.v v14, (%[q42])\n\t"
1391
+ "vwmul.vv v26, v2, v12\n\t"
1392
+ "vwmul.vv v22, v4, v10\n\t"
1393
+ "vwmul.vv v30, v6, v14\n\t"
1394
+ "vwmacc.vv v18, v1, v9\n\t"
1395
+ "vle8.v v15, (%[q43])\n\t"
1396
+ "vwmacc.vv v26, v3, v13\n\t"
1397
+ "vwmacc.vv v22, v5, v11\n\t"
1398
+ "vwmacc.vv v30, v7, v15\n\t"
1459
1399
  "vmv.v.x v0, zero\n\t"
1460
- "vsetivli zero, 8, e32, m2\n\t"
1461
- "vredsum.vs v0, v6, v0\n\t"
1462
- "vmv.x.s %[sumi], v0"
1463
- : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
1464
- : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
1465
- , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
1400
+ "vsetivli zero, 16, e16, m2, ta, ma\n\t"
1401
+ "vwredsum.vs v4, v16, v0\n\t"
1402
+ "lbu %[s0], 0(%[scale])\n\t"
1403
+ "vwredsum.vs v5, v20, v0\n\t"
1404
+ "lbu %[s1], 1(%[scale])\n\t"
1405
+ "vwredsum.vs v6, v24, v0\n\t"
1406
+ "lbu %[s2], 2(%[scale])\n\t"
1407
+ "vwredsum.vs v7, v28, v0\n\t"
1408
+ "lbu %[s3], 3(%[scale])\n\t"
1409
+ "vwredsum.vs v8, v18, v0\n\t"
1410
+ "lbu %[q40], 4(%[scale])\n\t"
1411
+ "vwredsum.vs v9, v22, v0\n\t"
1412
+ "lbu %[q41], 5(%[scale])\n\t"
1413
+ "vwredsum.vs v10, v26, v0\n\t"
1414
+ "lbu %[q42], 6(%[scale])\n\t"
1415
+ "vwredsum.vs v11, v30, v0\n\t"
1416
+ "lbu %[q43], 7(%[scale])\n\t"
1417
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1418
+ "vmul.vx v0, v4, %[s0]\n\t"
1419
+ "vmul.vx v1, v8, %[q40]\n\t"
1420
+ "vmacc.vx v0, %[s1], v5\n\t"
1421
+ "vmacc.vx v1, %[q41], v9\n\t"
1422
+ "vmacc.vx v0, %[s2], v6\n\t"
1423
+ "vmacc.vx v1, %[q42], v10\n\t"
1424
+ "vmacc.vx v0, %[s3], v7\n\t"
1425
+ "vmacc.vx v1, %[q43], v11\n\t"
1426
+ "vfcvt.f.x.v v0, v0\n\t"
1427
+ "vfcvt.f.x.v v1, v1\n\t"
1428
+ "vfmv.f.s %[ft2], v0\n\t"
1429
+ "vfmv.f.s %[ftmp], v1\n\t"
1430
+ "fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
1431
+ "fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
1432
+ : [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
1433
+ , [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
1434
+ , [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
1435
+ , [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
1436
+ : [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
1437
+ , [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
1438
+ , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
1466
1439
  , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
1467
1440
  : "memory"
1468
1441
  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
@@ -1470,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1470
1443
  , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1471
1444
  , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1472
1445
  );
1473
- sumf -= dmin * sumi;
1474
-
1475
- const uint8_t * restrict q4 = x[i].qs;
1476
- const int8_t * restrict q8 = y[i].qs;
1477
-
1478
- sumi = 0;
1479
- const uint8_t * scale = scales;
1480
-
1481
- for (int j = 0; j < QK_K/128; ++j) {
1482
- int vl128 = 128, vl64 = 64, vl32 = 32;
1483
- __asm__ __volatile__(
1484
- "vsetvli zero, %[vl128], e8, m8\n\t"
1485
- "vle8.v v8, (%[q8])\n\t"
1486
- "vsetvli zero, %[vl64], e8, m4\n\t"
1487
- "vle8.v v0, (%[q4])\n\t"
1488
- "vsrl.vi v4, v0, 4\n\t"
1489
- "vand.vi v0, v0, 0xF\n\t"
1490
- "vsetvli zero, %[vl32], e8, m2\n\t"
1491
- "vwmul.vv v28, v6, v14\n\t"
1492
- "vwmul.vv v20, v4, v10\n\t"
1493
- "vwmul.vv v24, v2, v12\n\t"
1494
- "vwmul.vv v16, v0, v8\n\t"
1495
- "vsetivli zero, 4, e32, m1\n\t"
1496
- "vle8.v v2, (%[scale])\n\t"
1497
- "vmv.v.x v0, zero\n\t"
1498
- "vzext.vf4 v1, v2\n\t"
1499
- "vsetvli zero, %[vl32], e16, m4\n\t"
1500
- "vwredsum.vs v6, v24, v0\n\t"
1501
- "vwredsum.vs v7, v28, v0\n\t"
1502
- "vwredsum.vs v4, v16, v0\n\t"
1503
- "vwredsum.vs v5, v20, v0\n\t"
1504
- "vsetivli zero, 4, e32, m1\n\t"
1505
- "vslideup.vi v6, v7, 1\n\t"
1506
- "vslideup.vi v4, v5, 1\n\t"
1507
- "vslideup.vi v4, v6, 2\n\t"
1508
- "vmul.vv v8, v4, v1\n\t"
1509
- "vredsum.vs v0, v8, v0\n\t"
1510
- "vmv.x.s %[tmp], v0\n\t"
1511
- "add %[sumi], %[sumi], %[tmp]"
1512
- : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
1513
- : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
1514
- , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
1515
- : "memory"
1516
- , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1517
- , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1518
- , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1519
- , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1520
- );
1521
-
1522
- q4 += 64; q8 += 128; scale += 4;
1523
- }
1524
-
1525
- sumf += d * sumi;
1526
1446
  }
1527
1447
  break;
1528
1448
  default:
@@ -1534,60 +1454,15 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1534
1454
 
1535
1455
  #else
1536
1456
 
1537
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1538
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1539
-
1540
- int8_t aux8[QK_K];
1541
- int16_t aux16[8];
1542
- float sums [8];
1543
- int32_t aux32[8];
1544
- memset(sums, 0, 8*sizeof(float));
1545
-
1546
- float sumf = 0;
1547
- for (int i = 0; i < nb; ++i) {
1548
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1549
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1550
- memset(aux32, 0, 8*sizeof(int32_t));
1551
- int8_t * GGML_RESTRICT a = aux8;
1552
- for (int j = 0; j < QK_K/64; ++j) {
1553
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1554
- a += 32;
1555
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1556
- a += 32; q4 += 32;
1557
- }
1558
- memcpy(utmp, x[i].scales, 12);
1559
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1560
- const uint32_t uaux = utmp[1] & kmask1;
1561
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1562
- utmp[2] = uaux;
1563
- utmp[0] &= kmask1;
1457
+ UNUSED(x);
1458
+ UNUSED(y);
1459
+ UNUSED(kmask1);
1460
+ UNUSED(kmask2);
1461
+ UNUSED(kmask3);
1462
+ UNUSED(nb);
1463
+ UNUSED(utmp);
1564
1464
 
1565
- int sumi = 0;
1566
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1567
- a = aux8;
1568
- int is = 0;
1569
- for (int j = 0; j < QK_K/32; ++j) {
1570
- int32_t scale = scales[is++];
1571
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1572
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1573
- q8 += 8; a += 8;
1574
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1575
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1576
- q8 += 8; a += 8;
1577
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1578
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1579
- q8 += 8; a += 8;
1580
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1581
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1582
- q8 += 8; a += 8;
1583
- }
1584
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1585
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1586
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1587
- sumf -= dmin * sumi;
1588
- }
1589
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1590
- *s = sumf;
1465
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1591
1466
  #endif
1592
1467
  }
1593
1468
 
@@ -1698,65 +1573,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1698
1573
 
1699
1574
  #else
1700
1575
 
1701
- const uint8_t * scales = (const uint8_t*)&utmp[0];
1702
- const uint8_t * mins = (const uint8_t*)&utmp[2];
1703
-
1704
- int8_t aux8[QK_K];
1705
- int16_t aux16[8];
1706
- float sums [8];
1707
- int32_t aux32[8];
1708
- memset(sums, 0, 8*sizeof(float));
1576
+ UNUSED(x);
1577
+ UNUSED(y);
1578
+ UNUSED(kmask1);
1579
+ UNUSED(kmask2);
1580
+ UNUSED(kmask3);
1581
+ UNUSED(nb);
1582
+ UNUSED(utmp);
1709
1583
 
1710
- float sumf = 0;
1711
- for (int i = 0; i < nb; ++i) {
1712
- const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1713
- const uint8_t * GGML_RESTRICT hm = x[i].qh;
1714
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
1715
- memset(aux32, 0, 8*sizeof(int32_t));
1716
- int8_t * GGML_RESTRICT a = aux8;
1717
- uint8_t m = 1;
1718
- for (int j = 0; j < QK_K/64; ++j) {
1719
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1720
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1721
- a += 32; m <<= 1;
1722
- for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1723
- for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1724
- a += 32; m <<= 1;
1725
- q4 += 32;
1726
- }
1727
- memcpy(utmp, x[i].scales, 12);
1728
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1729
- const uint32_t uaux = utmp[1] & kmask1;
1730
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1731
- utmp[2] = uaux;
1732
- utmp[0] &= kmask1;
1733
-
1734
- int sumi = 0;
1735
- for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
1736
- a = aux8;
1737
- int is = 0;
1738
- for (int j = 0; j < QK_K/32; ++j) {
1739
- int32_t scale = scales[is++];
1740
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1741
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1742
- q8 += 8; a += 8;
1743
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1744
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1745
- q8 += 8; a += 8;
1746
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1747
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1748
- q8 += 8; a += 8;
1749
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1750
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1751
- q8 += 8; a += 8;
1752
- }
1753
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1754
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1755
- const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1756
- sumf -= dmin * sumi;
1757
- }
1758
- for (int l = 0; l < 8; ++l) sumf += sums[l];
1759
- *s = sumf;
1584
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1760
1585
  #endif
1761
1586
  }
1762
1587
 
@@ -1944,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1944
1769
  case 128:
1945
1770
  for (int i = 0; i < nb; ++i) {
1946
1771
 
1772
+ __builtin_prefetch(&x[i + 1].d, 0, 1);
1773
+
1947
1774
  const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1948
1775
 
1949
1776
  const uint8_t * restrict q6 = x[i].ql;
@@ -1952,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1952
1779
 
1953
1780
  const int8_t * restrict scale = x[i].scales;
1954
1781
 
1955
- int sum_t = 0;
1956
- int t0;
1782
+ int q6h;
1783
+ float ftmp;
1957
1784
 
1958
1785
  for (int j = 0; j < QK_K/128; ++j) {
1959
1786
  __asm__ __volatile__(
1787
+ "addi %[q6h], %[q6], 32\n\t"
1788
+ "ld t0, 0(%[scale])\n\t"
1789
+ "addi %[scale], %[scale], 8\n\t"
1790
+ "slli t6, t0, 1 * 8\n\t"
1791
+ "lb zero, 0(%[q6])\n\t"
1792
+ "slli t5, t0, 2 * 8\n\t"
1793
+ "slli t4, t0, 3 * 8\n\t"
1794
+ "lb zero, 0(%[q6h])\n\t"
1795
+ "slli t3, t0, 4 * 8\n\t"
1796
+ "slli t2, t0, 5 * 8\n\t"
1797
+ "lb zero, 0(%[qh])\n\t"
1798
+ "lb zero, 31(%[q6h])\n\t"
1799
+ "slli t1, t0, 6 * 8\n\t"
1800
+ "srai a7, t0, 56\n\t"
1960
1801
  "vsetvli zero, %[vl32], e8, m2\n\t"
1802
+ "vle8.v v8, (%[q6])\n\t"
1803
+ "srai t6, t6, 56\n\t"
1804
+ "srai t5, t5, 56\n\t"
1805
+ "srai t4, t4, 56\n\t"
1806
+ "srai t3, t3, 56\n\t"
1807
+ "vle8.v v10, (%[q6h])\n\t"
1808
+ "addi %[q6], %[q6], 64\n\t"
1809
+ "slli t0, t0, 7 * 8\n\t"
1810
+ "srai t2, t2, 56\n\t"
1811
+ "srai t1, t1, 56\n\t"
1812
+ "srai t0, t0, 56\n\t"
1961
1813
  "vle8.v v4, (%[qh])\n\t"
1814
+ "vsrl.vi v12, v8, 4\n\t"
1815
+ "vsrl.vi v14, v10, 4\n\t"
1816
+ "lb zero, 0(%[q8])\n\t"
1817
+ "vand.vi v8, v8, 0xF\n\t"
1818
+ "vand.vi v10, v10, 0xF\n\t"
1819
+ "lb zero, 32(%[q8])\n\t"
1962
1820
  "vsll.vi v0, v4, 4\n\t"
1963
1821
  "vsll.vi v2, v4, 2\n\t"
1822
+ "lb zero, 64(%[q8])\n\t"
1964
1823
  "vsrl.vi v6, v4, 2\n\t"
1965
- "vsetvli zero, %[vl64], e8, m4\n\t"
1966
- "vle8.v v8, (%[q6])\n\t"
1967
- "vsrl.vi v12, v8, 4\n\t"
1968
- "vand.vi v8, v8, 0xF\n\t"
1969
- "vsetvli zero, %[vl128], e8, m8\n\t"
1970
1824
  "vand.vx v0, v0, %[mask]\n\t"
1825
+ "lb zero, 96(%[q8])\n\t"
1826
+ "vand.vx v2, v2, %[mask]\n\t"
1827
+ "vand.vx v4, v4, %[mask]\n\t"
1828
+ "vand.vx v6, v6, %[mask]\n\t"
1971
1829
  "vor.vv v8, v8, v0\n\t"
1830
+ "lb zero, 127(%[q8])\n\t"
1831
+ "vor.vv v10, v10, v2\n\t"
1832
+ "vor.vv v12, v12, v4\n\t"
1833
+ "vor.vv v14, v14, v6\n\t"
1834
+ "vsetvli zero, %[vl128], e8, m8\n\t"
1972
1835
  "vle8.v v0, (%[q8])\n\t"
1973
1836
  "vsub.vx v8, v8, %[vl32]\n\t"
1974
1837
  "vsetvli zero, %[vl64], e8, m4\n\t"
@@ -1985,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1985
1848
  "vwredsum.vs v13, v28, v0\n\t"
1986
1849
  "vwredsum.vs v14, v30, v0\n\t"
1987
1850
  "vsetivli zero, 4, e32, m1\n\t"
1988
- "vslideup.vi v10, v9, 1\n\t"
1989
- "vslideup.vi v8, v7, 1\n\t"
1990
- "vslideup.vi v11, v12, 1\n\t"
1991
- "vslideup.vi v13, v14, 1\n\t"
1992
- "vslideup.vi v10, v8, 2\n\t"
1993
- "vslideup.vi v11, v13, 2\n\t"
1994
- "vsetivli zero, 8, e32, m2\n\t"
1995
- "vle8.v v2, (%[scale])\n\t"
1996
- "vsext.vf4 v4, v2\n\t"
1997
- "vmul.vv v2, v4, v10\n\t"
1998
- "vredsum.vs v0, v2, v0\n\t"
1999
- "vmv.x.s %[t0], v0\n\t"
2000
- "add %[sumi], %[sumi], %[t0]"
2001
- : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
2002
- : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
1851
+ "vmul.vx v0, v10, t0\n\t"
1852
+ "vmul.vx v1, v9, t1\n\t"
1853
+ "vmacc.vx v0, t2, v8\n\t"
1854
+ "vmacc.vx v1, t3, v7\n\t"
1855
+ "vmacc.vx v0, t4, v11\n\t"
1856
+ "vmacc.vx v1, t5, v12\n\t"
1857
+ "vmacc.vx v0, t6, v13\n\t"
1858
+ "vmacc.vx v1, a7, v14\n\t"
1859
+ "vadd.vv v0, v0, v1\n\t"
1860
+ "vfcvt.f.x.v v0, v0\n\t"
1861
+ "vfmv.f.s %[ftmp], v0\n\t"
1862
+ "fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
1863
+ : [q6] "+&r" (q6), [q6h] "=&r" (q6h)
1864
+ , [scale] "+&r" (scale)
1865
+ , [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
1866
+ : [qh] "r" (qh), [q8] "r" (q8)
2003
1867
  , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
2004
- , [mask] "r" (0x30)
1868
+ , [mask] "r" (0x30), [d] "f" (d)
2005
1869
  : "memory"
2006
1870
  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2007
1871
  , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
2008
1872
  , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
2009
1873
  , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1874
+ , "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
1875
+ , "a6", "a5", "a4", "a3"
2010
1876
  );
2011
- q6 += 64; qh += 32; q8 += 128; scale += 8;
1877
+ qh += 32; q8 += 128;
2012
1878
  }
2013
-
2014
- sumf += d * sum_t;
2015
-
2016
1879
  }
2017
1880
  break;
2018
1881
  default:
@@ -2024,46 +1887,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2024
1887
 
2025
1888
  #else
2026
1889
 
2027
- int8_t aux8[QK_K];
2028
- int16_t aux16[8];
2029
- float sums [8];
2030
- int32_t aux32[8];
2031
- memset(sums, 0, 8*sizeof(float));
1890
+ UNUSED(x);
1891
+ UNUSED(y);
1892
+ UNUSED(nb);
2032
1893
 
2033
- float sumf = 0;
2034
- for (int i = 0; i < nb; ++i) {
2035
- const uint8_t * GGML_RESTRICT q4 = x[i].ql;
2036
- const uint8_t * GGML_RESTRICT qh = x[i].qh;
2037
- const int8_t * GGML_RESTRICT q8 = y[i].qs;
2038
- memset(aux32, 0, 8*sizeof(int32_t));
2039
- int8_t * GGML_RESTRICT a = aux8;
2040
- for (int j = 0; j < QK_K; j += 128) {
2041
- for (int l = 0; l < 32; ++l) {
2042
- a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
2043
- a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
2044
- a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
2045
- a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
2046
- }
2047
- a += 128;
2048
- q4 += 64;
2049
- qh += 32;
2050
- }
2051
- a = aux8;
2052
- int is = 0;
2053
- for (int j = 0; j < QK_K/16; ++j) {
2054
- int scale = x[i].scales[is++];
2055
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2056
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2057
- q8 += 8; a += 8;
2058
- for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2059
- for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2060
- q8 += 8; a += 8;
2061
- }
2062
- const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2063
- for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2064
- }
2065
- for (int l = 0; l < 8; ++l) sumf += sums[l];
2066
- *s = sumf;
1894
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2067
1895
  #endif
2068
1896
  }
2069
1897