whispercpp 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,1468 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
7
+
8
+ #include "../../quants.h"
9
+ #include "../../ggml-cpu-impl.h"
10
+
11
+ #include <math.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+ #include <float.h>
15
+ #include <stdlib.h> // for qsort
16
+ #include <stdio.h> // for GGML_ASSERT
17
+
18
// Epsilon floors — NOTE(review): presumably lower bounds on per-group maxima
// to avoid dividing by (near) zero during quantization; their use sites are
// not in this file's SIMD paths, confirm against the generic quantizers.
#define GROUP_MAX_EPS 1e-15f
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
#define GROUP_MAX_EPS_IQ2_S 1e-8f
#define GROUP_MAX_EPS_IQ1_M 1e-7f
#define GROUP_MAX_EPS_IQ1_S 1e-12f

#define UNUSED GGML_UNUSED

#if defined(__VXE__) || defined(__VXE2__)
// Token-pasting helpers: B8(c,s) expands to 256 comma-separated 64-bit
// constants, one per 8-bit index, where each nibble of the constant is c or s
// depending on the corresponding bit of the index.
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s ) B7(c,s, c), B7(c,s, s)

// precomputed tables for expanding 8bits to 8 bytes:
static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4

// permute mask for byteswapping (reverses the bytes of each 64-bit half)
static const uint8x16_t v_kperm = (const uint8x16_t){
     7,  6,  5,  4,  3,  2,  1,  0,
    15, 14, 13, 12, 11, 10,  9,  8
};
#endif
46
+
47
// Quantize a row of floats to q8_0: blocks of 32 int8 values sharing one
// fp16 scale d = amax / 127 (s390x VXE/VXE2 SIMD path, scalar fallback).
//
// x  : k input floats; k must be a multiple of QK8_0 (== 32)
// vy : output, nb = k / QK8_0 block_q8_0 blocks
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32); // the SIMD path below hardcodes 32 floats per block
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        // load the block's 32 floats and take element-wise absolute values,
        // then reduce the 8 vectors pairwise (8 -> 4 -> 2 -> 1) into amaxv[0]
        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        // horizontal max over the remaining 4 lanes = block absolute maximum
        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f / d : 0.0f; // guard: all-zero block -> quants stay 0

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        // scale each group of 4 floats by 1/d and round to int8
        for (int j = 0; j < 8; j++) {
            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
            /* Uses non-default rounding for vec_signed or vec_round */
            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));

            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
            y[i].qs[4*j + 2] = vec_extract(vi, 2);
            y[i].qs[4*j + 3] = vec_extract(vi, 3);
        }
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}
93
+
94
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
95
+ assert(k % QK8_1 == 0);
96
+ const int nb = k / QK8_1;
97
+
98
+ block_q8_1 * GGML_RESTRICT y = vy;
99
+
100
+ #if defined(__VXE__) || defined(__VXE2__)
101
+ for (int i = 0; i < nb; i++) {
102
+ float32x4_t srcv [8];
103
+ float32x4_t asrcv[8];
104
+ float32x4_t amaxv[8];
105
+
106
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
107
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
108
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
109
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
110
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
111
+
112
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
113
+ vec_extract(amaxv[0], 1)),
114
+ MAX(vec_extract(amaxv[0], 2),
115
+ vec_extract(amaxv[0], 3)));
116
+
117
+ const float d = amax / ((1 << 7) - 1);
118
+ const float id = d ? 1.0f / d : 0.0f;
119
+
120
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
121
+
122
+ int32x4_t acc = vec_splats(0);
123
+
124
+ for (int j = 0; j < 8; j++) {
125
+ const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
126
+ /* Uses non-default rounding for vec_signed or vec_round */
127
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
128
+
129
+ y[i].qs[4*j + 0] = vec_extract(vi, 0);
130
+ y[i].qs[4*j + 1] = vec_extract(vi, 1);
131
+ y[i].qs[4*j + 2] = vec_extract(vi, 2);
132
+ y[i].qs[4*j + 3] = vec_extract(vi, 3);
133
+
134
+ acc = vec_add(acc, vi);
135
+ }
136
+
137
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
138
+ }
139
+ #else
140
+ GGML_UNUSED(nb);
141
+ // scalar
142
+ quantize_row_q8_1_ref(x, y, k);
143
+ #endif
144
+ }
145
+
146
+
147
+ //===================================== Dot products =================================
148
+
149
// Dot product of one q4_0 row (vx) with one q8_0 row (vy) over n values.
// q4_0 packs two 4-bit quants per byte with an implicit -8 offset and a
// per-block fp16 scale; q8_0 is int8 with a per-block fp16 scale.
// s390x VXE/VXE2 SIMD path; otherwise delegates to the generic version.
//
// bs/bx/by/nrc exist for multi-row variants and are unused here (nrc must be 1).
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F); // low-nibble mask
    const int8x16_t v_s = vec_splats( (const int8_t)0x08);  // q4_0 implicit offset

    for (; ib < nb; ++ib) {
        // unpack 32 4-bit quants: low nibbles -> v_xl, high nibbles -> v_xh
        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);

        // shift from unsigned [0,15] to signed [-8,7]
        const int8x16_t v_xls = vec_sub(v_xl, v_s);
        const int8x16_t v_xhs = vec_sub(v_xh, v_s);

        const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

        // widening multiplies of odd/even int8 lanes -> int16 partial products
        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);

        // fold the four partial-product vectors; adding the lane-reversed copy
        // makes every lane hold a pairwise sum
        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);

        // widen to f32 and accumulate, scaled by d_x * d_y
        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc); // horizontal sum of the 4 accumulator lanes
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
207
+
208
// Dot product of one q4_1 row (vx) with one q8_1 row (vy) over n values.
// q4_1 stores unsigned 4-bit quants with per-block scale d and minimum m;
// the minimum's contribution is folded in via the precomputed y[ib].s
// (= d_y * sum of y's quants, see quantize_row_q8_1), accumulated in summs.
// s390x VXE/VXE2 SIMD path; otherwise delegates to the generic version.
//
// bs/bx/by/nrc exist for multi-row variants and are unused here (nrc must be 1).
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float summs = 0; // accumulates m_x * s_y per block
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splat_u8(0x0F); // low-nibble mask

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // minimum contribution: m_x * (d_y * sum(y quants))
        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

        // unpack 32 unsigned 4-bit quants (no -8 offset, unlike q4_0)
        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);

        const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);

        // int8 dot products accumulated into 4 int32 lanes
        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xy = vec_float(v_xy_);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc) + summs;
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
264
+
265
// Dot product of one mxfp4 row (vx) with one q8_0 row (vy) over n values.
// mxfp4 packs two 4-bit codebook indices per byte (looked up in
// kvalues_mxfp4) and stores one shared block exponent x->e, converted via
// GGML_E8M0_TO_FP32_HALF. The main loop processes two blocks per iteration;
// the second loop handles a possible odd trailing block.
// s390x VXE/VXE2 SIMD path; otherwise delegates to the generic version.
//
// bs/bx/by/nrc exist for multi-row variants and are unused here (nrc must be 1).
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const int qk = QK_MXFP4;
    const int nb = n / qk;

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);         // fp4 codebook (16 entries)
    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F); // low-nibble mask

    float32x4_t v_acc = vec_splats(0.0f);

    // main loop: two blocks per iteration
#pragma GCC unroll 8
    for (; ib + 1 < nb; ib += 2) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, x1->qs);

        // split each byte into its low/high 4-bit index
        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

        // codebook lookup: index -> int8 value via vec_perm on v_k
        v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
        v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
        v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
        v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

        const int8x16_t v_y0l = vec_xl(0, y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
        const int8x16_t v_y1l = vec_xl(0, y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);

        // int8 dot products accumulated into 4 int32 lanes per block
        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        // per-block scale: x's block exponent times y's fp16 delta
        const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_acc = vec_madd(v_xy0f, v_d0, v_acc);
        v_acc = vec_madd(v_xy1f, v_d1, v_acc);
    }

    // tail: remaining single block, same steps as above
    for (; ib < nb; ++ib) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];

        const uint8x16_t v_x = vec_xl(0, x0->qs);

        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0, y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);

        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        v_acc = vec_madd(v_xyf, v_d, v_acc);
    }

    sumf = vec_hsum_f32x4(v_acc); // horizontal sum of the 4 accumulator lanes
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
359
+
360
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
361
+ const int qk = QK8_0;
362
+ const int nb = n / qk;
363
+
364
+ assert(n % qk == 0);
365
+ assert(qk == QK5_0);
366
+ assert(nrc == 1);
367
+ UNUSED(nrc);
368
+ UNUSED(bx);
369
+ UNUSED(by);
370
+ UNUSED(bs);
371
+
372
+ const block_q5_0 * GGML_RESTRICT x = vx;
373
+ const block_q8_0 * GGML_RESTRICT y = vy;
374
+
375
+ int ib = 0;
376
+ float sumf = 0.0f;
377
+
378
+ #if defined(__VXE__) || defined(__VXE2__)
379
+ float32x4_t v_sum0 = vec_splats(0.0f);
380
+ float32x4_t v_sum1 = vec_splats(0.0f);
381
+
382
+ uint32_t qh0, qh1;
383
+ uint64_t tmp0[4], tmp1[4];
384
+
385
+ const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
386
+
387
+ #pragma GCC unroll 4
388
+ for (; ib + 1 < nb; ib += 2) {
389
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
390
+ const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
391
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
392
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
393
+
394
+ memcpy(&qh0, x0->qh, sizeof(qh0));
395
+ memcpy(&qh1, x1->qh, sizeof(qh1));
396
+
397
+ tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
398
+ tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
399
+ tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
400
+ tmp0[3] = table_b2b_1[(qh0 >> 24) ];
401
+
402
+ tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
403
+ tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
404
+ tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
405
+ tmp1[3] = table_b2b_1[(qh1 >> 24) ];
406
+
407
+ int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
408
+ int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
409
+ int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
410
+ int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
411
+
412
+ // required for fixing the byteorder
413
+ v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
414
+ v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
415
+ v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
416
+ v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
417
+
418
+ const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
419
+ const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);
420
+
421
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
422
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
423
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
424
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
425
+
426
+ const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
427
+ const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
428
+ const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
429
+ const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);
430
+
431
+ const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs);
432
+ const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
433
+ const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs);
434
+ const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);
435
+
436
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
437
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
438
+
439
+ const float32x4_t v_xy0f = vec_float(v_xy0);
440
+ const float32x4_t v_xy1f = vec_float(v_xy1);
441
+
442
+ const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
443
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
444
+
445
+ v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
446
+ v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
447
+ }
448
+
449
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
450
+
451
+ #pragma GCC unroll 4
452
+ for (; ib < nb; ++ib) {
453
+ const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
454
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib];
455
+
456
+ uint32_t qh;
457
+ memcpy(&qh, x0->qh, sizeof(qh));
458
+
459
+ uint64_t tmp[4];
460
+ tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
461
+ tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
462
+ tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
463
+ tmp[3] = table_b2b_1[(qh >> 24) ];
464
+
465
+ int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
466
+ int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
467
+
468
+ // required for fixing the byteorder
469
+ v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
470
+ v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
471
+
472
+ const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
473
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
474
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
475
+
476
+ const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
477
+ const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);
478
+
479
+ const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs);
480
+ const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
481
+
482
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
483
+ const float32x4_t v_xyf = vec_float(v_xy);
484
+
485
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
486
+ const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
487
+
488
+ sumf += vec_hsum_f32x4(v_acc);
489
+ }
490
+
491
+ *s = sumf;
492
+ #else
493
+ UNUSED(nb);
494
+ UNUSED(x);
495
+ UNUSED(y);
496
+ UNUSED(ib);
497
+ UNUSED(sumf);
498
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
499
+ #endif
500
+ }
501
+
502
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
503
+ const int qk = QK8_1;
504
+ const int nb = n / qk;
505
+
506
+ assert(n % qk == 0);
507
+ assert(qk == QK5_1);
508
+ assert(nrc == 1);
509
+ UNUSED(nrc);
510
+ UNUSED(bx);
511
+ UNUSED(by);
512
+ UNUSED(bs);
513
+
514
+ const block_q5_1 * GGML_RESTRICT x = vx;
515
+ const block_q8_1 * GGML_RESTRICT y = vy;
516
+
517
+ int ib = 0;
518
+ float sumf = 0.0f;
519
+
520
+ #if defined(__VXE__) || defined(__VXE2__)
521
+ float32x4_t v_sum0 = vec_splats(0.0f);
522
+ float32x4_t v_sum1 = vec_splats(0.0f);
523
+
524
+ float summs0 = 0.0f;
525
+ float summs1 = 0.0f;
526
+
527
+ uint32_t qh0;
528
+ uint32_t qh1;
529
+
530
+ uint64_t tmp0[4];
531
+ uint64_t tmp1[4];
532
+
533
+ const uint8x16_t v_m = vec_splats((uint8_t)0x0F);
534
+
535
+ #pragma GCC unroll 4
536
+ for (; ib + 1 < nb; ib += 2) {
537
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
538
+ const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
539
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
540
+ const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
541
+
542
+ summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
543
+ summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
544
+
545
+ memcpy(&qh0, x0->qh, sizeof(qh0));
546
+ memcpy(&qh1, x1->qh, sizeof(qh1));
547
+
548
+ tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
549
+ tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
550
+ tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
551
+ tmp0[3] = table_b2b_0[(qh0 >> 24) ];
552
+
553
+ tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
554
+ tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
555
+ tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
556
+ tmp1[3] = table_b2b_0[(qh1 >> 24) ];
557
+
558
+ int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
559
+ int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
560
+ int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
561
+ int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));
562
+
563
+ // required for fixing the byteorder
564
+ v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
565
+ v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
566
+ v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
567
+ v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);
568
+
569
+ const uint8x16_t v_x0 = vec_xl(0, x0->qs);
570
+ const uint8x16_t v_x1 = vec_xl(0, x1->qs);
571
+
572
+ const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
573
+ const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
574
+ const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
575
+ const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
576
+
577
+ const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
578
+ const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
579
+ const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
580
+ const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
581
+
582
+ const int8x16_t v_y0l = vec_xl(0 , y0->qs);
583
+ const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
584
+ const int8x16_t v_y1l = vec_xl(0 , y1->qs);
585
+ const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
586
+
587
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
588
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
589
+
590
+ const float32x4_t v_xy0f = vec_float(v_xy0);
591
+ const float32x4_t v_xy1f = vec_float(v_xy1);
592
+
593
+ const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
594
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
595
+
596
+ v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
597
+ v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
598
+ }
599
+
600
+ sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
601
+
602
+ #pragma GCC unroll 4
603
+ for (; ib < nb; ++ib) {
604
+ const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
605
+ const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
606
+
607
+ float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
608
+
609
+ uint32_t qh;
610
+ memcpy(&qh, x0->qh, sizeof(qh));
611
+
612
+ uint64_t tmp[4];
613
+ tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
614
+ tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
615
+ tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
616
+ tmp[3] = table_b2b_0[(qh >> 24) ];
617
+
618
+ int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
619
+ int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
620
+
621
+ // required for fixing the byteorder
622
+ v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
623
+ v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
624
+
625
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
626
+ const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
627
+ const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
628
+
629
+ const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
630
+ const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
631
+
632
+ const int8x16_t v_yl = vec_xl(0 , y0->qs);
633
+ const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
634
+
635
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
636
+ const float32x4_t v_xyf = vec_float(v_xy);
637
+
638
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
639
+ const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
640
+
641
+ sumf += vec_hsum_f32x4(v_acc) + summs;
642
+ }
643
+
644
+ *s = sumf;
645
+ #else
646
+ UNUSED(nb);
647
+ UNUSED(x);
648
+ UNUSED(y);
649
+ UNUSED(ib);
650
+ UNUSED(sumf);
651
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
652
+ #endif
653
+ }
654
+
655
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
656
+ const int qk = QK8_0;
657
+ const int nb = n / qk;
658
+
659
+ assert(n % qk == 0);
660
+ assert(nrc == 1);
661
+ UNUSED(nrc);
662
+ UNUSED(bx);
663
+ UNUSED(by);
664
+ UNUSED(bs);
665
+
666
+ const block_q8_0 * GGML_RESTRICT x = vx;
667
+ const block_q8_0 * GGML_RESTRICT y = vy;
668
+
669
+ int ib = 0;
670
+ float sumf = 0;
671
+
672
+ #if defined(__VXE__) || defined(__VXE2__)
673
+ float32x4_t acc = vec_splats(0.0f);
674
+
675
+ #pragma GCC unroll 8
676
+ for (; ib < nb; ++ib) {
677
+ __builtin_prefetch(x[ib].qs, 0, 1);
678
+ __builtin_prefetch(y[ib].qs, 0, 1);
679
+
680
+ const int8x16_t v_xl = vec_xl(0 , x[ib].qs);
681
+ const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
682
+ const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
683
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
684
+
685
+ const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
686
+ const float32x4_t v_xy = vec_float(v_xy_);
687
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
688
+
689
+ acc = vec_madd(v_xy, v_d, acc);
690
+ }
691
+
692
+ sumf = vec_hsum_f32x4(acc);
693
+
694
+ *s = sumf;
695
+ #else
696
+ UNUSED(nb);
697
+ UNUSED(x);
698
+ UNUSED(y);
699
+ UNUSED(ib);
700
+ UNUSED(sumf);
701
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
702
+ #endif
703
+ }
704
+
705
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
706
+ assert(n % QK_K == 0);
707
+ assert(nrc == 1);
708
+ UNUSED(nrc);
709
+ UNUSED(bx);
710
+ UNUSED(by);
711
+ UNUSED(bs);
712
+
713
+ const uint32_t kmask1 = 0x03030303;
714
+ const uint32_t kmask2 = 0x0f0f0f0f;
715
+
716
+ const block_q3_K * GGML_RESTRICT x = vx;
717
+ const block_q8_K * GGML_RESTRICT y = vy;
718
+
719
+ const int nb = n / QK_K;
720
+
721
+ #if defined(__VXE__) || defined(__VXE2__)
722
+ uint32_t aux[3];
723
+ uint32_t utmp[4];
724
+
725
+ const int32x4_t v_z = vec_splat_s32(0);
726
+ const uint8x16_t v_3m = vec_splat_u8(0x03);
727
+
728
+ const uint8x16_t v_0c = vec_splat_u8(1);
729
+ const uint8x16_t v_1c = vec_sl(v_0c, 1);
730
+ const uint8x16_t v_2c = vec_sl(v_0c, 2);
731
+ const uint8x16_t v_3c = vec_sl(v_0c, 3);
732
+
733
+ uint8x16_t q3h[4];
734
+ uint8x16_t q3b[2];
735
+ int8x16_t q3bytes[4];
736
+ int8x16_t q8bytes[8];
737
+ uint8x16_t qhbits[2];
738
+
739
+ float sum = 0;
740
+
741
+ for (int i = 0; i < nb; ++i) {
742
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
743
+
744
+ const uint8_t * restrict x0l = x[i].qs;
745
+ const uint8_t * restrict x0h = x[i].hmask;
746
+ const int8_t * restrict y0 = y[i].qs;
747
+
748
+ qhbits[0] = vec_xl(0 , x0h);
749
+ qhbits[1] = vec_xl(16, x0h);
750
+
751
+ int32_t isum = 0;
752
+
753
+ memcpy(aux, x[i].scales, 12);
754
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
755
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
756
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
757
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
758
+
759
+ int8_t * scale = (int8_t *)utmp;
760
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
761
+
762
+ for (int j = 0; j < QK_K/128; ++j) {
763
+ int32x4_t isum0, isum1, isum2, isum3;
764
+
765
+ q3b[0] = vec_xl(0 , x0l);
766
+ q3b[1] = vec_xl(16, x0l);
767
+ x0l += 32;
768
+
769
+ q8bytes[0] = vec_xl(0 , y0);
770
+ q8bytes[1] = vec_xl(16 , y0);
771
+ q8bytes[2] = vec_xl(32 , y0);
772
+ q8bytes[3] = vec_xl(48 , y0);
773
+ q8bytes[4] = vec_xl(64 , y0);
774
+ q8bytes[5] = vec_xl(80 , y0);
775
+ q8bytes[6] = vec_xl(96 , y0);
776
+ q8bytes[7] = vec_xl(112, y0);
777
+ y0 += 128;
778
+
779
+ q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
780
+ q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
781
+ q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
782
+ q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
783
+
784
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
785
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
786
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
787
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
788
+
789
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
790
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
791
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
792
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
793
+
794
+ isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
795
+ isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
796
+ isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
797
+ isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
798
+
799
+ scale += 4;
800
+
801
+ q3h[0] = vec_andc(v_2c, qhbits[0]);
802
+ q3h[1] = vec_andc(v_2c, qhbits[1]);
803
+ q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
804
+ q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
805
+
806
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
807
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
808
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
809
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
810
+
811
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
812
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
813
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
814
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
815
+
816
+ isum += vec_hsum_i32x4(isum0) * scale[0];
817
+ isum += vec_hsum_i32x4(isum1) * scale[1];
818
+ isum += vec_hsum_i32x4(isum2) * scale[2];
819
+ isum += vec_hsum_i32x4(isum3) * scale[3];
820
+
821
+ scale += 4;
822
+
823
+ if (j == 0) {
824
+ qhbits[0] = vec_sr(qhbits[0], 4);
825
+ qhbits[1] = vec_sr(qhbits[1], 4);
826
+ }
827
+ }
828
+
829
+ sum += d * isum;
830
+ }
831
+
832
+ *s = sum;
833
+
834
+ #else
835
+ UNUSED(kmask1);
836
+ UNUSED(kmask2);
837
+ UNUSED(x);
838
+ UNUSED(y);
839
+ UNUSED(nb);
840
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
841
+ #endif
842
+ }
843
+
844
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
845
+ assert(n % QK_K == 0);
846
+ assert(nrc == 1);
847
+ UNUSED(nrc);
848
+ UNUSED(bx);
849
+ UNUSED(by);
850
+ UNUSED(bs);
851
+
852
+ const block_q4_K * GGML_RESTRICT x = vx;
853
+ const block_q8_K * GGML_RESTRICT y = vy;
854
+
855
+ const int nb = n / QK_K;
856
+
857
+ static const uint32_t kmask1 = 0x3f3f3f3f;
858
+ static const uint32_t kmask2 = 0x0f0f0f0f;
859
+ static const uint32_t kmask3 = 0x03030303;
860
+
861
+ uint32_t utmp[4];
862
+
863
+ #if defined(__VXE__) || defined(__VXE2__)
864
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
865
+ const int32x4_t v_z = vec_splat_s32(0);
866
+
867
+ uint8x16_t v_x[2];
868
+ int8x16_t v_xl[2];
869
+ int8x16_t v_y[2];
870
+
871
+ float sumf = 0;
872
+
873
+ for (int i = 0; i < nb; ++i) {
874
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
875
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
876
+
877
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
878
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
879
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
880
+
881
+ memcpy(utmp, x[i].scales, 12);
882
+
883
+ uint32x4_t v_mins8 = { 0 };
884
+ v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
885
+ v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);
886
+
887
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
888
+ utmp[0] &= kmask1;
889
+
890
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
891
+
892
+ const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
893
+ const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
894
+ const int32x4_t v_mins = v_minso + v_minse;
895
+ sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
896
+
897
+ const uint8_t * scales = (const uint8_t *)utmp;
898
+ const uint8_t * GGML_RESTRICT x0 = x[i].qs;
899
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
900
+
901
+ int32_t sumi1 = 0;
902
+ int32_t sumi2 = 0;
903
+
904
+ for (int j = 0; j < QK_K/64; ++j) {
905
+ v_x[0] = vec_xl(0 , x0);
906
+ v_x[1] = vec_xl(16, x0);
907
+ x0 += 32;
908
+
909
+ v_y[0] = vec_xl(0 , y0);
910
+ v_y[1] = vec_xl(16, y0);
911
+ y0 += 32;
912
+
913
+ v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
914
+ v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
915
+
916
+ const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
917
+ sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
918
+
919
+ v_y[0] = vec_xl(0 , y0);
920
+ v_y[1] = vec_xl(16, y0);
921
+ y0 += 32;
922
+
923
+ v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
924
+ v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
925
+
926
+ const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
927
+ sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
928
+ }
929
+
930
+ sumf += d * (sumi1 + sumi2);
931
+ }
932
+
933
+ *s = sumf;
934
+
935
+ #else
936
+ UNUSED(x);
937
+ UNUSED(y);
938
+ UNUSED(nb);
939
+ UNUSED(kmask1);
940
+ UNUSED(kmask2);
941
+ UNUSED(kmask3);
942
+ UNUSED(utmp);
943
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
944
+ #endif
945
+ }
946
+
947
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
948
+ assert(n % QK_K == 0);
949
+ assert(nrc == 1);
950
+ UNUSED(nrc);
951
+ UNUSED(bx);
952
+ UNUSED(by);
953
+ UNUSED(bs);
954
+
955
+ const block_q5_K * GGML_RESTRICT x = vx;
956
+ const block_q8_K * GGML_RESTRICT y = vy;
957
+
958
+ const int nb = n / QK_K;
959
+
960
+ static const uint32_t kmask1 = 0x3f3f3f3f;
961
+ static const uint32_t kmask2 = 0x0f0f0f0f;
962
+ static const uint32_t kmask3 = 0x03030303;
963
+
964
+ uint32_t utmp[4];
965
+
966
+ #if defined(__VXE__) || defined(__VXE2__)
967
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
968
+ const uint8x16_t v_1m = vec_splat_u8(0x01);
969
+ const uint8x16_t v_2m = vec_splat_u8(0x02);
970
+
971
+ const int32x4_t v_z = vec_splat_s32(0);
972
+
973
+ const uchar8x16_t v_minsm = {
974
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
975
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
976
+ };
977
+
978
+ int8x16_t q5b[4];
979
+ uint8x16_t q5h[4];
980
+
981
+ uint8x16_t v_xl[2];
982
+ uint8x16_t v_xh[2];
983
+ int8x16_t v_y[4];
984
+
985
+ float sumf = 0;
986
+
987
+ for (int i = 0; i < nb; ++i) {
988
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
989
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
990
+
991
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
992
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
993
+ const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
994
+
995
+ memcpy(utmp, x[i].scales, 12);
996
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
997
+ const uint32_t uaux = utmp[1] & kmask1;
998
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
999
+ utmp[2] = uaux;
1000
+ utmp[0] &= kmask1;
1001
+
1002
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
1003
+ const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
1004
+ const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
1005
+
1006
+ const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
1007
+ const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
1008
+ const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
1009
+ const int32_t mins = vec_hsum_i32x4(v_mins);
1010
+
1011
+ const uint8_t * scales = (const uint8_t *)utmp;
1012
+ const uint8_t * GGML_RESTRICT x0l = x[i].qs;
1013
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
1014
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
1015
+
1016
+ v_xh[0] = vec_xl(0 , x0h);
1017
+ v_xh[1] = vec_xl(16, x0h);
1018
+
1019
+ int32_t sumi = 0;
1020
+ for (int j = 0; j < QK_K/64; ++j) {
1021
+ v_xl[0] = vec_xl(0 , x0l);
1022
+ v_xl[1] = vec_xl(16, x0l);
1023
+ x0l += 32;
1024
+
1025
+ v_y[0] = vec_xl(0 , y0);
1026
+ v_y[1] = vec_xl(16, y0);
1027
+ v_y[2] = vec_xl(32, y0);
1028
+ v_y[3] = vec_xl(48, y0);
1029
+ y0 += 64;
1030
+
1031
+ q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
1032
+ q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
1033
+ q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
1034
+ q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
1035
+ v_xh[0] = vec_sr(v_xh[0], 2);
1036
+ v_xh[1] = vec_sr(v_xh[1], 2);
1037
+
1038
+ q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
1039
+ q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
1040
+ q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
1041
+ q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
1042
+
1043
+ int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
1044
+ int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
1045
+
1046
+ sumi += vec_hsum_i32x4(sumi0) * *scales++;
1047
+ sumi += vec_hsum_i32x4(sumi1) * *scales++;
1048
+ }
1049
+
1050
+ sumf += d * sumi - dmin * mins;
1051
+ }
1052
+
1053
+ *s = sumf;
1054
+
1055
+ #else
1056
+ UNUSED(x);
1057
+ UNUSED(y);
1058
+ UNUSED(nb);
1059
+ UNUSED(kmask1);
1060
+ UNUSED(kmask2);
1061
+ UNUSED(kmask3);
1062
+ UNUSED(utmp);
1063
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1064
+ #endif
1065
+ }
1066
+
1067
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1068
+ assert(n % QK_K == 0);
1069
+ assert(nrc == 1);
1070
+ UNUSED(nrc);
1071
+ UNUSED(bx);
1072
+ UNUSED(by);
1073
+ UNUSED(bs);
1074
+
1075
+ const block_q6_K * GGML_RESTRICT x = vx;
1076
+ const block_q8_K * GGML_RESTRICT y = vy;
1077
+
1078
+ const int nb = n / QK_K;
1079
+
1080
+ #if defined(__VXE__) || defined(__VXE2__)
1081
+ float sum = 0;
1082
+
1083
+ // Lower 4-bit and upper 2-bit masks
1084
+ const uint8x16_t v_lm = vec_splat_u8(0x0F);
1085
+ const uint8x16_t v_um = vec_splat_u8(0x03);
1086
+
1087
+ const int32x4_t v_z = vec_splat_s32(0);
1088
+
1089
+ int8x16_t q6b[4];
1090
+ uint8x16_t q6h[4];
1091
+
1092
+ uint8x16_t v_xl[4];
1093
+ uint8x16_t v_xh[2];
1094
+ int8x16_t v_y[4];
1095
+
1096
+ for (int i = 0; i < nb; ++i) {
1097
+ const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
1098
+
1099
+ const uint8_t * GGML_RESTRICT x0l = x[i].ql;
1100
+ const uint8_t * GGML_RESTRICT x0h = x[i].qh;
1101
+ const int8_t * GGML_RESTRICT y0 = y[i].qs;
1102
+
1103
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
1104
+
1105
+ const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
1106
+ const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
1107
+
1108
+ const int8x16_t v_scale = vec_xl(0, scale);
1109
+ const int16x8_t v_scalel = vec_unpackh(v_scale);
1110
+ const int16x8_t v_scaleh = vec_unpackl(v_scale);
1111
+
1112
+ const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
1113
+ const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
1114
+ const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
1115
+ const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
1116
+ const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
1117
+
1118
+ const int32_t mins = vec_hsum_i32x4(v_mins);
1119
+
1120
+ int32_t isum = 0;
1121
+ for (int j = 0; j < QK_K/128; ++j) {
1122
+ // Load model upper 2 bits
1123
+ v_xh[0] = vec_xl(0 , x0h);
1124
+ v_xh[1] = vec_xl(16, x0h);
1125
+ x0h += 32;
1126
+
1127
+ // Load model lower 4 bits
1128
+ v_xl[0] = vec_xl(0 , x0l);
1129
+ v_xl[1] = vec_xl(16, x0l);
1130
+ v_xl[2] = vec_xl(32, x0l);
1131
+ v_xl[3] = vec_xl(48, x0l);
1132
+ x0l += 64;
1133
+
1134
+ // Load activation quants
1135
+ v_y[0] = vec_xl(0 , y0);
1136
+ v_y[1] = vec_xl(16, y0);
1137
+ v_y[2] = vec_xl(32, y0);
1138
+ v_y[3] = vec_xl(48, y0);
1139
+ y0 += 64;
1140
+
1141
+ q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
1142
+ q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
1143
+ uint8x16_t shifted = vec_sr(v_xh[0], 2);
1144
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
1145
+ shifted = vec_sr(v_xh[1], 2);
1146
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
1147
+
1148
+ q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
1149
+ q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
1150
+ q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
1151
+ q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));
1152
+
1153
+ int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
1154
+ int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
1155
+ int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
1156
+ int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
1157
+
1158
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
1159
+ vec_hsum_i32x4(summs1) * scale[1] +
1160
+ vec_hsum_i32x4(summs2) * scale[2] +
1161
+ vec_hsum_i32x4(summs3) * scale[3];
1162
+
1163
+ scale += 4;
1164
+
1165
+
1166
+ // Load activation quants
1167
+ v_y[0] = vec_xl(0 , y0);
1168
+ v_y[1] = vec_xl(16, y0);
1169
+ v_y[2] = vec_xl(32, y0);
1170
+ v_y[3] = vec_xl(48, y0);
1171
+ y0 += 64;
1172
+
1173
+ shifted = vec_sr(v_xh[0], 4);
1174
+ q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
1175
+ shifted = vec_sr(v_xh[1], 4);
1176
+ q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
1177
+ shifted = vec_sr(v_xh[0], 6);
1178
+ q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
1179
+ shifted = vec_sr(v_xh[1], 6);
1180
+ q6h[3] = vec_sl(vec_and(v_um, shifted), 4);
1181
+
1182
+ q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
1183
+ q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
1184
+ q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
1185
+ q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));
1186
+
1187
+ summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
1188
+ summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
1189
+ summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
1190
+ summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
1191
+
1192
+ isum += vec_hsum_i32x4(summs0) * scale[0] +
1193
+ vec_hsum_i32x4(summs1) * scale[1] +
1194
+ vec_hsum_i32x4(summs2) * scale[2] +
1195
+ vec_hsum_i32x4(summs3) * scale[3];
1196
+
1197
+ scale += 4;
1198
+ }
1199
+
1200
+ sum += d_all * y[i].d * (isum - 32 * mins);
1201
+ }
1202
+
1203
+ *s = sum;
1204
+
1205
+ #else
1206
+ UNUSED(x);
1207
+ UNUSED(y);
1208
+ UNUSED(nb);
1209
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1210
+ #endif
1211
+ }
1212
+
1213
+ // #if defined(__VXE__) || defined(__VXE2__)
1214
+ // static const int8_t keven_signs_q2xs[1024] = {
1215
+ // 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
1216
+ // 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
1217
+ // 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
1218
+ // 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
1219
+ // 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
1220
+ // 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
1221
+ // 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
1222
+ // 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
1223
+ // 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
1224
+ // 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
1225
+ // 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
1226
+ // 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
1227
+ // 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
1228
+ // 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
1229
+ // 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
1230
+ // 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
1231
+ // 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
1232
+ // 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
1233
+ // 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
1234
+ // 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
1235
+ // 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
1236
+ // 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
1237
+ // 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
1238
+ // 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
1239
+ // 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
1240
+ // 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
1241
+ // 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
1242
+ // 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
1243
+ // 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
1244
+ // 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
1245
+ // 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
1246
+ // 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
1247
+ // };
1248
+ // #endif
1249
+
1250
+ // void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1251
+ // assert(n % QK_K == 0);
1252
+ // assert(nrc == 1);
1253
+ // UNUSED(nrc);
1254
+ // UNUSED(bx);
1255
+ // UNUSED(by);
1256
+ // UNUSED(bs);
1257
+
1258
+ // const block_iq2_xxs * GGML_RESTRICT x = vx;
1259
+ // const block_q8_K * GGML_RESTRICT y = vy;
1260
+
1261
+ // const int nb = n / QK_K;
1262
+
1263
+ // #if defined(__VXE__) || defined(__VXE2__)
1264
+ // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
1265
+
1266
+ // uint32_t aux32[4];
1267
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
1268
+
1269
+ // float sumf = 0;
1270
+
1271
+ // for (int i = 0; i < nb; ++i) {
1272
+ // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1273
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1274
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1275
+
1276
+ // float sumf1 = 0, sumf2 = 0;
1277
+
1278
+ // for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
1279
+ // int8x16_t q8b0 = vec_xl( 0, q8);
1280
+ // int8x16_t q8b1 = vec_xl(16, q8);
1281
+ // int8x16_t q8b2 = vec_xl(32, q8);
1282
+ // int8x16_t q8b3 = vec_xl(48, q8);
1283
+ // q8 += 64;
1284
+
1285
+ // memcpy(aux32, q2, 4 * sizeof(uint32_t));
1286
+ // q2 += 8;
1287
+
1288
+ // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
1289
+ // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
1290
+ // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
1291
+ // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };
1292
+
1293
+ // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) };
1294
+ // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
1295
+ // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) };
1296
+ // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };
1297
+
1298
+ // q2u0 = vec_mul(q2u0, q2s0);
1299
+ // q2u1 = vec_mul(q2u1, q2s1);
1300
+ // q2u2 = vec_mul(q2u2, q2s2);
1301
+ // q2u3 = vec_mul(q2u3, q2s3);
1302
+
1303
+ // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
1304
+ // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);
1305
+
1306
+ // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
1307
+ // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
1308
+ // }
1309
+
1310
+ // sumf += d * (sumf1 + sumf2);
1311
+ // }
1312
+
1313
+ // *s = 0.25f * sumf;
1314
+
1315
+ // #else
1316
+
1317
+ // uint32_t aux32[2];
1318
+ // const uint8_t * aux8 = (const uint8_t *)aux32;
1319
+
1320
+ // float sumf = 0.f;
1321
+ // for (int i = 0; i < nb; ++i) {
1322
+ // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1323
+ // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1324
+ // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1325
+ // int32_t bsum = 0;
1326
+ // for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1327
+ // memcpy(aux32, q2, 2*sizeof(uint32_t));
1328
+ // q2 += 4;
1329
+ // const uint32_t ls = 2*(aux32[1] >> 28) + 1;
1330
+ // int32_t sumi = 0;
1331
+ // for (int l = 0; l < 4; ++l) {
1332
+ // const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
1333
+ // const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
1334
+ // for (int j = 0; j < 8; ++j) {
1335
+ // sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1336
+ // }
1337
+ // q8 += 8;
1338
+ // }
1339
+ // bsum += sumi * ls;
1340
+ // }
1341
+ // sumf += d * bsum;
1342
+ // }
1343
+ // *s = 0.125f * sumf;
1344
+ // #endif
1345
+ // }
1346
+
1347
// Dot product of an IQ4_NL-quantized row (vx) against a Q8_0-quantized row (vy),
// n elements, result written to *s. Vectorized path for s390x VXE/VXE2; falls
// back to the generic implementation otherwise.
//
// Signature follows the ggml_vec_dot convention: bs/bx/by are byte strides and
// nrc the number of result columns — all unused here (single-row, nrc must be 1).
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL; // number of quantization blocks

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    // v_k holds the 16-entry IQ4_NL codebook; vec_perm below uses it as a
    // 4-bit-index -> int8-value table lookup. v_m masks out the low nibble.
    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    for (; ib < nb; ++ib) {
        const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];

        // Each byte of x0->qs packs two 4-bit codebook indices:
        // low nibbles -> first half of the block, high nibbles -> second half.
        const uint8x16_t v_x = vec_xl(0, x0->qs);
        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        // Translate 4-bit indices into their signed int8 codebook values.
        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0 , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        // Accumulate both halves into one int32x4 partial-sum vector.
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);

        // Horizontal sum, then apply both per-block scales (fp16 -> fp32).
        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
    }

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1396
+
1397
// Dot product of an IQ4_XS-quantized row (vx) against a Q8_K-quantized row (vy),
// n elements, result written to *s. Vectorized path for s390x VXE/VXE2; falls
// back to the generic implementation otherwise.
//
// Signature follows the ggml_vec_dot convention: bs/bx/by are byte strides and
// nrc the number of result columns — all unused here (single-row, nrc must be 1).
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__VXE__) || defined(__VXE2__)
    // v_k is the shared IQ4_NL codebook (IQ4_XS reuses it); vec_perm below
    // uses it as a 4-bit-index -> int8-value table lookup. v_m masks nibbles.
    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
        const int8_t * GGML_RESTRICT q8 = y[ibl].qs;

        // h carries the 2 high bits of each 6-bit sub-block scale; consumed
        // 4 bits (two scales) per iteration via h >>= 4 below.
        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0;
        // Each iteration handles 64 quantized values (two 32-value sub-blocks).
        for (int ib = 0; ib < QK_K/64; ++ib) {
            const uint8x16_t v_x0 = vec_xl(0 , q4);
            const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
            q4 += 32; // 32 bytes = 64 packed nibbles

            // Split packed nibbles: low nibbles and high nibbles are the
            // first and second 16 values of each sub-block respectively.
            int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
            int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
            int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
            int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

            // Translate 4-bit indices into signed int8 codebook values.
            v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
            v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
            v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
            v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

            const int8x16_t v_y0 = vec_xl( 0, q8);
            const int8x16_t v_y1 = vec_xl(16, q8);
            const int8x16_t v_y2 = vec_xl(32, q8);
            const int8x16_t v_y3 = vec_xl(48, q8);
            q8 += 64;

            // One int32x4 partial-sum vector per 32-value sub-block.
            int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
            int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);

            // Reassemble the 6-bit sub-block scales: 4 low bits from
            // scales_l (one nibble per sub-block) plus 2 high bits from h,
            // biased by -32.
            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;

            h >>= 4; // advance to the next pair of high-bit fields

            sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
            sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }

        // Apply the super-block scales (x scale is fp16, y scale is float).
        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1468
+