whispercpp 1.3.3 → 1.3.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (586)
  1. checksums.yaml +4 -4
  2. data/ext/ruby_whisper_params.c +55 -25
  3. data/ext/sources/CMakeLists.txt +1 -1
  4. data/ext/sources/bindings/javascript/package.json +1 -1
  5. data/ext/sources/build-xcframework.sh +24 -0
  6. data/ext/sources/examples/CMakeLists.txt +1 -0
  7. data/ext/sources/examples/addon.node/addon.cpp +19 -19
  8. data/ext/sources/examples/addon.node/index.js +7 -5
  9. data/ext/sources/examples/bench/bench.cpp +26 -16
  10. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  11. data/ext/sources/examples/cli/cli.cpp +4 -2
  12. data/ext/sources/examples/command/command.cpp +26 -24
  13. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  14. data/ext/sources/examples/common-ggml.cpp +2 -0
  15. data/ext/sources/examples/lsp/lsp.cpp +19 -17
  16. data/ext/sources/examples/server/server.cpp +24 -13
  17. data/ext/sources/examples/server.py +6 -1
  18. data/ext/sources/examples/stream/stream.cpp +4 -2
  19. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  20. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  21. data/ext/sources/examples/talk-llama/CMakeLists.txt +2 -2
  22. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  23. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  24. data/ext/sources/examples/talk-llama/llama-arch.cpp +588 -15
  25. data/ext/sources/examples/talk-llama/llama-arch.h +58 -1
  26. data/ext/sources/examples/talk-llama/llama-batch.cpp +103 -71
  27. data/ext/sources/examples/talk-llama/llama-batch.h +31 -18
  28. data/ext/sources/examples/talk-llama/llama-chat.cpp +120 -5
  29. data/ext/sources/examples/talk-llama/llama-chat.h +7 -0
  30. data/ext/sources/examples/talk-llama/llama-context.cpp +460 -357
  31. data/ext/sources/examples/talk-llama/llama-context.h +44 -29
  32. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  33. data/ext/sources/examples/talk-llama/llama-graph.cpp +543 -271
  34. data/ext/sources/examples/talk-llama/llama-graph.h +278 -168
  35. data/ext/sources/examples/talk-llama/llama-hparams.cpp +118 -4
  36. data/ext/sources/examples/talk-llama/llama-hparams.h +61 -15
  37. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  38. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  39. data/ext/sources/examples/talk-llama/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +38 -29
  40. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2020 -0
  41. data/ext/sources/examples/talk-llama/llama-kv-cache.h +358 -27
  42. data/ext/sources/examples/talk-llama/llama-kv-cells.h +80 -28
  43. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +56 -36
  44. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +30 -29
  45. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +48 -19
  46. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +13 -14
  47. data/ext/sources/examples/talk-llama/llama-memory.h +16 -10
  48. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +2 -0
  49. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  50. data/ext/sources/examples/talk-llama/llama-model.cpp +7165 -2336
  51. data/ext/sources/examples/talk-llama/llama-model.h +60 -9
  52. data/ext/sources/examples/talk-llama/llama-quant.cpp +48 -10
  53. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  54. data/ext/sources/examples/talk-llama/llama-vocab.cpp +440 -13
  55. data/ext/sources/examples/talk-llama/llama-vocab.h +45 -0
  56. data/ext/sources/examples/talk-llama/llama.cpp +65 -10
  57. data/ext/sources/examples/talk-llama/llama.h +95 -177
  58. data/ext/sources/examples/talk-llama/talk-llama.cpp +9 -6
  59. data/ext/sources/examples/talk-llama/unicode.cpp +207 -0
  60. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  61. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +4 -2
  62. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  63. data/ext/sources/ggml/CMakeLists.txt +59 -31
  64. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  65. data/ext/sources/ggml/include/ggml-backend.h +17 -1
  66. data/ext/sources/ggml/include/ggml-cpu.h +1 -1
  67. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  68. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  69. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  70. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  71. data/ext/sources/ggml/include/ggml.h +221 -16
  72. data/ext/sources/ggml/src/CMakeLists.txt +17 -2
  73. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  74. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  75. data/ext/sources/ggml/src/ggml-backend-reg.cpp +30 -13
  76. data/ext/sources/ggml/src/ggml-backend.cpp +221 -38
  77. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  78. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  79. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  80. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  81. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  82. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  83. data/ext/sources/ggml/src/ggml-cann/common.h +143 -1
  84. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +488 -69
  85. data/ext/sources/ggml/src/ggml-common.h +17 -0
  86. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +40 -18
  87. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  90. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +103 -582
  91. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  92. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +265 -437
  93. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  94. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +521 -353
  95. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  96. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  97. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  98. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +32 -2
  99. data/ext/sources/ggml/src/ggml-cpu/common.h +14 -0
  100. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +13 -6
  101. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +70 -42
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +35 -28
  103. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  104. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  105. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +227 -97
  106. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +474 -1116
  107. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1587 -1177
  108. data/ext/sources/ggml/src/ggml-cpu/ops.h +5 -8
  109. data/ext/sources/ggml/src/ggml-cpu/quants.c +35 -0
  110. data/ext/sources/ggml/src/ggml-cpu/quants.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +458 -47
  112. data/ext/sources/ggml/src/ggml-cpu/repack.h +22 -0
  113. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +89 -60
  114. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  115. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  116. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  117. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  118. data/ext/sources/ggml/src/ggml-cpu/traits.cpp +2 -2
  119. data/ext/sources/ggml/src/ggml-cpu/traits.h +1 -1
  120. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +170 -26
  121. data/ext/sources/ggml/src/ggml-cpu/vec.h +506 -63
  122. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  123. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  124. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  125. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  126. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  127. data/ext/sources/ggml/src/ggml-cuda/common.cuh +250 -63
  128. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  129. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  130. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  131. data/ext/sources/ggml/src/ggml-cuda/convert.cu +95 -22
  132. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +15 -0
  133. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  134. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  135. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  136. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  137. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +498 -367
  138. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +137 -91
  139. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  140. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  141. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  142. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +86 -50
  143. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  144. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  145. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  146. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +379 -107
  147. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  148. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  149. data/ext/sources/ggml/src/ggml-cuda/mean.cu +56 -2
  150. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  151. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  152. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  153. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  154. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  155. data/ext/sources/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  156. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  157. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  158. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  159. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  160. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  161. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  162. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  163. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  164. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  165. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  166. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  167. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  168. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  169. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  170. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  171. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  172. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  173. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  174. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  175. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  176. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  177. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -100
  178. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  179. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +21 -4
  180. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  181. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  182. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  183. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  184. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  185. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  186. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  187. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  188. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  189. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  190. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  191. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  192. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  193. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  195. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  196. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  197. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  198. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  199. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  200. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  201. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  202. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  203. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  204. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  205. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  206. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  208. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  209. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  210. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  211. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  212. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  213. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  214. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  234. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  235. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  236. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  237. data/ext/sources/ggml/src/ggml-cuda/unary.cu +90 -0
  238. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +8 -0
  239. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  240. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  241. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  242. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  243. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  244. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  245. data/ext/sources/ggml/src/ggml-impl.h +119 -9
  246. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -7
  247. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  248. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  249. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  250. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  251. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  252. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  253. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  254. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +136 -63
  255. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  256. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  257. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  258. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +2854 -1503
  259. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  260. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +18 -0
  261. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +2510 -242
  262. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  263. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  264. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  265. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  266. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  267. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  268. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  269. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  270. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  271. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  272. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +177 -0
  273. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  274. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  275. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  276. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  277. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  278. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  279. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  280. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  281. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  282. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  283. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  284. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  285. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  286. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  287. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  288. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  289. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  290. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  291. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  292. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  293. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  294. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  295. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  296. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  297. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  300. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  301. data/ext/sources/ggml/src/ggml-quants.c +111 -16
  302. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  303. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +67 -47
  304. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  305. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +15 -5
  306. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  307. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +25 -16
  308. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +10 -4
  309. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +166 -99
  310. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +72 -306
  311. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  312. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  313. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +1 -31
  314. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +79 -29
  315. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  316. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +14 -26
  317. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +9 -6
  318. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +328 -323
  319. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  320. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +2 -2
  321. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +80 -60
  322. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  323. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +74 -55
  324. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  325. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +8 -9
  326. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +35 -42
  327. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  328. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  329. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  330. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  331. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  332. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +16 -12
  333. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +3492 -883
  334. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  335. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  336. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  337. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  338. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  339. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  340. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  341. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  342. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  343. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  344. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  345. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  346. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  347. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  348. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  349. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  350. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  351. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  352. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  353. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  354. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  355. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  356. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  357. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  358. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  359. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  360. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  361. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  362. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  363. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  364. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  365. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +4 -0
  366. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  367. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  368. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  369. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  370. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  371. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  372. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  373. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  374. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  375. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +55 -11
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +335 -77
  401. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  402. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  403. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  404. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  405. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  406. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  407. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  408. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  409. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  410. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  411. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  412. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  413. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  414. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  415. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  416. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  417. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  418. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  419. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  420. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  421. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  422. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  423. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  424. data/ext/sources/ggml/src/ggml.c +478 -98
  425. data/ext/sources/ggml/src/gguf.cpp +8 -1
  426. data/ext/sources/src/whisper.cpp +23 -46
  427. data/ext/sources/tests/CMakeLists.txt +8 -1
  428. data/ext/sources/tests/test-vad-full.cpp +3 -3
  429. data/ext/sources/tests/test-vad.cpp +2 -2
  430. data/lib/whisper/model/uri.rb +1 -1
  431. data/sig/whisper.rbs +7 -0
  432. data/test/test_params.rb +8 -0
  433. data/test/test_whisper.rb +1 -1
  434. data/whispercpp.gemspec +1 -1
  435. metadata +164 -157
  436. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +0 -279
  437. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +0 -1841
  438. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +0 -303
  439. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  440. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  441. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  442. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  443. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  444. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  445. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  446. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  447. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  448. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  449. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  450. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  451. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  452. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  453. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  454. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  455. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  456. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  457. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  458. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  459. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  460. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  461. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  462. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  463. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  464. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  465. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  466. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  467. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  468. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  469. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  470. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  471. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  472. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  473. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  474. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  475. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  476. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  477. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  478. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  479. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  480. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  481. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  482. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  483. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  484. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  485. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  486. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  487. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  488. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  489. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  490. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  491. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  492. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  493. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  494. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  495. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  496. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  497. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  498. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  499. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  500. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  501. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  502. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  503. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  504. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  505. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  506. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  507. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  508. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  509. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  510. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  511. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  512. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  513. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  514. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  515. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  516. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  517. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  518. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  519. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  520. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  521. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  522. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  523. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  524. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  525. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  526. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  527. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  548. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  549. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  550. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  551. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  552. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  553. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  554. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  555. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  556. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  557. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  558. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  559. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  560. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  561. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  562. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  563. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  564. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  565. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  566. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  567. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  568. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  569. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  570. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  571. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  572. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  573. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  574. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  575. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  576. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  577. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  578. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  579. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  580. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  581. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  582. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  583. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  584. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  585. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  586. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -6280
@@ -55,7 +55,22 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
55
55
 
56
56
  inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
57
  inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
58
- inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
58
+
59
// Element-wise sum of two float vectors: z[k] = x[k] + y[k] for k in [0, n).
// When __AVX2__ is defined, full groups of eight floats are added with
// unaligned 256-bit loads/stores; the remaining elements (or all of them
// when AVX2 is not compiled in) go through the scalar loop.
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int done = 0;
#if defined(__AVX2__)
    // vector part: continue while a whole group of 8 lanes is still available
    while (done + 7 < n) {
        const __m256 lhs = _mm256_loadu_ps(x + done);
        const __m256 rhs = _mm256_loadu_ps(y + done);
        _mm256_storeu_ps(z + done, _mm256_add_ps(lhs, rhs));
        done += 8;
    }
#endif
    // scalar part: picks up wherever the vector part stopped
    while (done < n) {
        z[done] = x[done] + y[done];
        ++done;
    }
}
73
+
59
74
  inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
60
75
  for (int i = 0; i < n; ++i) {
61
76
  z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
@@ -104,36 +119,149 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
104
119
  }
105
120
 
106
121
  #if defined(GGML_SIMD)
107
- const int np = (n & ~(GGML_F16_STEP - 1));
122
+ #if defined(__ARM_FEATURE_SVE)
123
+
124
+ const int sve_register_length = svcntb() * 8;
125
+ const int ggml_f16_epr = sve_register_length / 16; // running when 16
126
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
127
+
128
+ const int np = (n & ~(ggml_f16_step - 1));
129
+
130
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
131
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
134
+
135
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
136
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
139
+
140
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
141
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
142
+
143
+ for (int i = 0; i < np; i += ggml_f16_step) {
144
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
145
+
146
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
147
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
148
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
149
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150
+
151
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
152
+
153
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
154
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155
+ ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
156
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
157
+
158
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
159
+
160
+ ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
161
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
163
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
164
+
165
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
166
+
167
+ ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
168
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169
+ ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
170
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
171
+
172
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
173
+
174
+ ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
175
+
176
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177
+ ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
178
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179
+
180
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
108
181
 
109
- GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
182
+ ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
110
183
 
111
- GGML_F16_VEC ax[GGML_F16_ARR];
112
- GGML_F16_VEC ay[GGML_F16_ARR];
184
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185
+ ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
186
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
113
187
 
114
- for (int i = 0; i < np; i += GGML_F16_STEP) {
115
- for (int j = 0; j < GGML_F16_ARR; j++) {
116
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
188
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
117
189
 
118
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
119
- ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
190
+ ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
120
191
 
121
- sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
192
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193
+ ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
194
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195
+
196
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
197
+
198
+ ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
199
+
200
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201
+ ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
202
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
203
+ }
204
+
205
+ const int np2 = (n & ~(ggml_f16_epr - 1));
206
+ for (int k = np; k < np2; k += ggml_f16_epr) {
207
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
208
+
209
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
210
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
211
+ rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
212
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
213
+ }
214
+
215
+ if (np2 < n) {
216
+ svbool_t pg = svwhilelt_b16(np2, n);
217
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
218
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
219
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
220
+
221
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
222
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
223
+ }
224
+ GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
225
+ GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
226
+ #elif defined(__riscv_v_intrinsic)
227
+ // todo: RVV impl
228
+ for (int i = 0; i < n; ++i) {
229
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
230
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
231
+ }
232
+ }
233
+ #else
234
+ const int np = (n & ~(GGML_F16_STEP - 1));
235
+
236
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
237
+
238
+ GGML_F16_VEC ax[GGML_F16_ARR];
239
+ GGML_F16_VEC ay[GGML_F16_ARR];
240
+
241
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
242
+ for (int j = 0; j < GGML_F16_ARR; j++) {
243
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
244
+
245
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
246
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
247
+
248
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
249
+ }
122
250
  }
123
251
  }
124
- }
125
252
 
126
- // reduce sum0..sum3 to sum0
127
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
128
- GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
129
- }
253
+ // reduce sum0..sum3 to sum0
254
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
255
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
256
+ }
130
257
 
131
- // leftovers
132
- for (int i = np; i < n; ++i) {
133
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
134
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
258
+ // leftovers
259
+ for (int i = np; i < n; ++i) {
260
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
261
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
262
+ }
135
263
  }
136
- }
264
+ #endif
137
265
  #else
138
266
  for (int i = 0; i < n; ++i) {
139
267
  for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -163,49 +291,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
163
291
 
164
292
  ax1 = GGML_F32_VEC_LOAD(x + i);
165
293
  ay1 = GGML_F32_VEC_LOAD(y + i);
166
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
294
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
167
295
 
168
296
  GGML_F32_VEC_STORE(y + i, ay1);
169
297
 
170
298
  ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
171
299
  ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
172
- ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
300
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
173
301
 
174
302
  GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
175
303
 
176
304
  ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
177
305
  ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
178
- ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
306
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
179
307
 
180
308
  GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
181
309
 
182
310
  ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
183
311
  ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
184
- ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
312
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
185
313
 
186
314
  GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
187
315
 
188
316
  ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
189
317
  ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
190
- ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
318
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
191
319
 
192
320
  GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
193
321
 
194
322
  ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
195
323
  ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
196
- ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
324
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
197
325
 
198
326
  GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
199
327
 
200
328
  ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
201
329
  ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
202
- ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
330
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
203
331
 
204
332
  GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
205
333
 
206
334
  ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
207
335
  ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
208
- ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
336
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
209
337
 
210
338
  GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
211
339
  }
@@ -215,7 +343,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
215
343
  for (int i = np; i < np2; i += ggml_f32_epr) {
216
344
  ax1 = GGML_F32_VEC_LOAD(x + i);
217
345
  ay1 = GGML_F32_VEC_LOAD(y + i);
218
- ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
346
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
219
347
 
220
348
  GGML_F32_VEC_STORE(y + i, ay1);
221
349
  }
@@ -228,6 +356,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
228
356
 
229
357
  svst1_f32(pg, y + np2, ay1);
230
358
  }
359
+ #elif defined(__riscv_v_intrinsic)
360
+ for (int i = 0, avl; i < n; i += avl) {
361
+ avl = __riscv_vsetvl_e32m8(n - i);
362
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
363
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
364
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
365
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
366
+ }
231
367
  #else
232
368
  const int np = (n & ~(GGML_F32_STEP - 1));
233
369
 
@@ -261,27 +397,112 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
261
397
 
262
398
  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
263
399
  #if defined(GGML_SIMD)
264
- const int np = (n & ~(GGML_F16_STEP - 1));
400
+ #if defined(__ARM_FEATURE_SVE)
401
+ const int sve_register_length = svcntb() * 8;
402
+ const int ggml_f16_epr = sve_register_length / 16;
403
+ const int ggml_f16_step = 8 * ggml_f16_epr;
404
+
405
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
406
+
407
+ const int np= (n & ~(ggml_f16_step - 1));
408
+
409
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411
+ for (int i = 0; i < np; i += ggml_f16_step) {
412
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
413
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
414
+ ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
415
+
416
+ GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
417
+
418
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
419
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
420
+ ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
421
+
422
+ GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
265
423
 
266
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
424
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
425
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
426
+ ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
267
427
 
268
- GGML_F16_VEC ax[GGML_F16_ARR];
269
- GGML_F16_VEC ay[GGML_F16_ARR];
428
+ GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
270
429
 
271
- for (int i = 0; i < np; i += GGML_F16_STEP) {
272
- for (int j = 0; j < GGML_F16_ARR; j++) {
273
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
274
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
275
- ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
430
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
431
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
432
+ ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
276
433
 
277
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
434
+ GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
435
+
436
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
437
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
438
+ ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
439
+
440
+ GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
441
+
442
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
443
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
444
+ ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
445
+
446
+ GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
447
+
448
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
449
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
450
+ ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
451
+
452
+ GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
453
+
454
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
455
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
456
+ ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
457
+
458
+ GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
278
459
  }
279
- }
460
+ const int np2 = (n & ~(ggml_f16_epr - 1));
461
+ for (int k = np; k < np2; k += ggml_f16_epr) {
462
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
463
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
464
+ ry = GGML_F16x_VEC_FMA(ry, rx, vx);
280
465
 
281
- // leftovers
282
- for (int i = np; i < n; ++i) {
283
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
284
- }
466
+ GGML_F16x_VEC_STORE(y + k, ry, 0);
467
+ }
468
+
469
+ if (np2 < n) {
470
+ svbool_t pg = svwhilelt_b16(np2, n);
471
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473
+ hy = svmad_f16_x(pg, hx, vx, hy);
474
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
475
+ }
476
+
477
+ #elif defined(__riscv_v_intrinsic)
478
+ // todo: RVV impl
479
+ // scalar
480
+ for (int i = 0; i < n; ++i) {
481
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
482
+ }
483
+ #else
484
+ const int np = (n & ~(GGML_F16_STEP - 1));
485
+
486
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
487
+
488
+ GGML_F16_VEC ax[GGML_F16_ARR];
489
+ GGML_F16_VEC ay[GGML_F16_ARR];
490
+
491
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
492
+ for (int j = 0; j < GGML_F16_ARR; j++) {
493
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
494
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
495
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
496
+
497
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
498
+ }
499
+ }
500
+
501
+ // leftovers
502
+ for (int i = np; i < n; ++i) {
503
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
504
+ }
505
+ #endif
285
506
  #else
286
507
  // scalar
287
508
  for (int i = 0; i < n; ++i) {
@@ -309,6 +530,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
309
530
  y[i] += x[k][i]*v[k][0];
310
531
  }
311
532
  }
533
+ #elif defined(__riscv_v_intrinsic)
534
+ for (int i = 0, avl; i < n; i += avl) {
535
+ avl = __riscv_vsetvl_e32m8(n - i);
536
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
537
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
538
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
539
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
540
+ }
541
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
542
+ }
312
543
  #else
313
544
  const int np = (n & ~(GGML_F32_STEP - 1));
314
545
 
@@ -351,6 +582,53 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
351
582
  #endif
352
583
  }
353
584
 
585
+ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
586
+ #if defined(GGML_USE_ACCELERATE)
587
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
588
+ #elif defined(GGML_SIMD)
589
+ #if defined(__ARM_FEATURE_SVE)
590
+ // scalar ; TODO: Write SVE code
591
+ for (int i = 0; i < n; ++i) {
592
+ y[i] = x[i]*s + b;
593
+ }
594
+ #elif defined(__riscv_v_intrinsic)
595
+ for (int i = 0, avl; i < n; i += avl) {
596
+ avl = __riscv_vsetvl_e32m8(n - i);
597
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
598
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
599
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
600
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
601
+ }
602
+ #else
603
+ const int np = (n & ~(GGML_F32_STEP - 1));
604
+
605
+ GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
606
+ GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
607
+
608
+ GGML_F32_VEC ay[GGML_F32_ARR];
609
+
610
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
611
+ for (int j = 0; j < GGML_F32_ARR; j++) {
612
+ ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
613
+ ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
614
+
615
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
616
+ }
617
+ }
618
+
619
+ // leftovers
620
+ for (int i = np; i < n; ++i) {
621
+ y[i] = x[i]*s + b;
622
+ }
623
+ #endif
624
+ #else
625
+ // scalar
626
+ for (int i = 0; i < n; ++i) {
627
+ y[i] = x[i]*s + b;
628
+ }
629
+ #endif
630
+ }
631
+
354
632
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
633
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
356
634
  #if defined(GGML_USE_ACCELERATE)
@@ -382,6 +660,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
382
660
  ay1 = svmul_f32_m(pg, ay1, vx);
383
661
  svst1_f32(pg, y + np, ay1);
384
662
  }
663
+ #elif defined(__riscv_v_intrinsic)
664
+ for (int i = 0, avl; i < n; i += avl) {
665
+ avl = __riscv_vsetvl_e32m8(n - i);
666
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
667
+ vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
668
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
669
+ }
385
670
  #else
386
671
  const int np = (n & ~(GGML_F32_STEP - 1));
387
672
 
@@ -413,25 +698,59 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
413
698
 
414
699
  inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
415
700
  #if defined(GGML_SIMD)
416
- const int np = (n & ~(GGML_F16_STEP - 1));
701
+ #if defined(__ARM_FEATURE_SVE)
702
+ const int sve_register_length = svcntb() * 8;
703
+ const int ggml_f16_epr = sve_register_length / 16;
704
+ const int ggml_f16_step = 2 * ggml_f16_epr;
705
+
706
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
707
+ const int np = (n & ~(ggml_f16_step - 1));
708
+ svfloat16_t ay1, ay2;
709
+
710
+ for (int i = 0; i < np; i += ggml_f16_step) {
711
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
712
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
713
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
714
+
715
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
716
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
717
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
718
+ }
719
+ // leftovers
720
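+ // fewer than ggml_f16_step elements remain here. Apply one predicated svmul to them. NOTE(review): a single svwhilelt_b16 predicate spans at most ggml_f16_epr lanes, so a tail longer than ggml_f16_epr looks only partly scaled - confirm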
721
+ if (np < n) {
722
+ svbool_t pg = svwhilelt_b16(np, n);
723
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
724
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
725
+ svst1_f16(pg, (__fp16 *)(y + np), out);
726
+ }
727
+ #elif defined(__riscv_v_intrinsic)
728
+ // todo: RVV impl
729
+ // scalar
730
+ for (int i = 0; i < n; ++i) {
731
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
732
+ }
733
+ #else
734
+ const int np = (n & ~(GGML_F16_STEP - 1));
417
735
 
418
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
736
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
419
737
 
420
- GGML_F16_VEC ay[GGML_F16_ARR];
738
+ GGML_F16_VEC ay[GGML_F16_ARR];
421
739
 
422
- for (int i = 0; i < np; i += GGML_F16_STEP) {
423
- for (int j = 0; j < GGML_F16_ARR; j++) {
424
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
425
- ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
740
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
741
+ for (int j = 0; j < GGML_F16_ARR; j++) {
742
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
743
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
426
744
 
427
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
745
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
746
+ }
428
747
  }
429
- }
430
748
 
431
- // leftovers
432
- for (int i = np; i < n; ++i) {
433
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
434
- }
749
+ // leftovers
750
+ for (int i = np; i < n; ++i) {
751
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
752
+ }
753
+ #endif
435
754
  #else
436
755
  // scalar
437
756
  for (int i = 0; i < n; ++i) {
@@ -683,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
683
1002
  }
684
1003
  #endif
685
1004
 
686
- #if defined(__ARM_NEON) && defined(__aarch64__)
1005
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1006
+
1007
+ inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
1008
+ const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1009
+ const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1010
+ const svfloat32_t n = svsub_f32_x(pg, z, r);
1011
+ const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1012
+ const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1013
+ const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1014
+ const svbool_t c = svacgt_n_f32(pg, n, 126);
1015
+ const svfloat32_t u = svmul_f32_x(pg, b, b);
1016
+ const svfloat32_t j = svmla_f32_x(pg,
1017
+ svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1018
+ svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1019
+ svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1020
+ const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1021
+ const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1022
+ const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1023
+ return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1024
+ svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1025
+ }
1026
+
1027
+ // computes silu x/(1+exp(-x)) in single precision vector
1028
+ inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1029
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1030
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1031
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1032
+ const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1033
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1034
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1035
+ }
1036
+
1037
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
687
1038
 
688
1039
  // adapted from arm limited optimized routine
689
1040
  // the maximum error is 1.45358 plus 0.5 ulps
@@ -874,7 +1225,59 @@ inline static __m128 ggml_v_silu(__m128 x) {
874
1225
  return _mm_div_ps(x, one_plus_exp_neg_x);
875
1226
  }
876
1227
 
877
- #endif // __ARM_NEON / __AVX2__ / __SSE2__
1228
+ #elif defined(__riscv_v_intrinsic)
1229
+
1230
+ // adapted from arm limited optimized routine
1231
+ // the maximum error is 1.45358 plus 0.5 ulps
1232
+ // numbers above 88.38 will flush to infinity
1233
+ // numbers beneath -103.97 will flush to zero
1234
+ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1235
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
1236
+ #ifdef __riscv_xtheadvector
1237
+ // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1238
+ vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
1239
+ z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
1240
+ #else
1241
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
1242
+ #endif
1243
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
1244
+ const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
1245
+ 0x1.7f7d1cp-20f, n, vl);
1246
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
1247
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
1248
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
1249
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
1250
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
1251
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
1252
+ __riscv_vfmacc_vv_f32m2(
1253
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
1254
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
1255
+ u, vl), u, vl);
1256
+ if (!__riscv_vcpop_m_b16(c, vl))
1257
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
1258
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
1259
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
1260
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
1261
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
1262
+ const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
1263
+ __riscv_vfmacc_vv_f32m2(k, k, j, vl),
1264
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
1265
+ c, vl);
1266
+ return __riscv_vmerge_vvm_f32m2(
1267
+ r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
1268
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
1269
+ vl);
1270
+ }
1271
+
1272
+ // computes silu x/(1+exp(-x)) in single precision vector
1273
+ inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1274
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1275
+ const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1276
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1277
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1278
+ }
1279
+
1280
+ #endif // __ARM_FEATURE_SVE / __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
878
1281
 
879
1282
  inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
880
1283
  for (int i = 0; i < n; ++i) {
@@ -953,9 +1356,49 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
953
1356
 
954
1357
  inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
955
1358
  for (int i = 0; i < n; ++i) {
956
- float v = GGML_CPU_FP16_TO_FP32(x[i]);
957
- float w = GGML_CPU_FP16_TO_FP32(g[i]);
958
- y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
1359
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1360
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1361
+ y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1362
+ }
1363
+ }
1364
+
1365
+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1366
+ for (int i = 0; i < n; ++i) {
1367
+ float xi = x[i];
1368
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1369
+ }
1370
+ }
1371
+
1372
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1373
+ for (int i = 0; i < n; ++i) {
1374
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1375
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1376
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1377
+ }
1378
+ }
1379
+
1380
+ #ifdef GGML_GELU_QUICK_FP16
1381
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1382
+ uint16_t t;
1383
+ for (int i = 0; i < n; ++i) {
1384
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1385
+ memcpy(&t, &fp16, sizeof(uint16_t));
1386
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1387
+ }
1388
+ }
1389
+ #else
1390
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1391
+ for (int i = 0; i < n; ++i) {
1392
+ y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
1393
+ }
1394
+ }
1395
+ #endif
1396
+
1397
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1398
+ const uint16_t * i16 = (const uint16_t *) x;
1399
+ for (int i = 0; i < n; ++i) {
1400
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1401
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
959
1402
  }
960
1403
  }
961
1404