whispercpp 1.3.2 → 1.3.4

This diff covers the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (664)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +59 -27
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/build-xcframework.sh +24 -0
  19. data/ext/sources/examples/CMakeLists.txt +1 -0
  20. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  21. data/ext/sources/examples/addon.node/addon.cpp +154 -35
  22. data/ext/sources/examples/addon.node/index.js +10 -5
  23. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  24. data/ext/sources/examples/bench/bench.cpp +29 -18
  25. data/ext/sources/examples/bench.wasm/index-tmpl.html +10 -9
  26. data/ext/sources/examples/cli/cli.cpp +7 -4
  27. data/ext/sources/examples/command/command.cpp +58 -32
  28. data/ext/sources/examples/command.wasm/index-tmpl.html +5 -4
  29. data/ext/sources/examples/common-ggml.cpp +2 -0
  30. data/ext/sources/examples/common-whisper.cpp +14 -7
  31. data/ext/sources/examples/lsp/lsp.cpp +21 -17
  32. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  33. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  34. data/ext/sources/examples/server/server.cpp +193 -35
  35. data/ext/sources/examples/server.py +6 -1
  36. data/ext/sources/examples/stream/stream.cpp +10 -2
  37. data/ext/sources/examples/stream.wasm/emscripten.cpp +6 -6
  38. data/ext/sources/examples/stream.wasm/index-tmpl.html +82 -5
  39. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -0
  40. data/ext/sources/examples/talk-llama/llama-adapter.cpp +101 -4
  41. data/ext/sources/examples/talk-llama/llama-adapter.h +6 -0
  42. data/ext/sources/examples/talk-llama/llama-arch.cpp +756 -15
  43. data/ext/sources/examples/talk-llama/llama-arch.h +85 -1
  44. data/ext/sources/examples/talk-llama/llama-batch.cpp +773 -272
  45. data/ext/sources/examples/talk-llama/llama-batch.h +126 -55
  46. data/ext/sources/examples/talk-llama/llama-chat.cpp +150 -13
  47. data/ext/sources/examples/talk-llama/llama-chat.h +8 -0
  48. data/ext/sources/examples/talk-llama/llama-context.cpp +814 -542
  49. data/ext/sources/examples/talk-llama/llama-context.h +68 -32
  50. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  51. data/ext/sources/examples/talk-llama/llama-cparams.h +4 -4
  52. data/ext/sources/examples/talk-llama/llama-graph.cpp +787 -440
  53. data/ext/sources/examples/talk-llama/llama-graph.h +333 -153
  54. data/ext/sources/examples/talk-llama/llama-hparams.cpp +128 -6
  55. data/ext/sources/examples/talk-llama/llama-hparams.h +80 -17
  56. data/ext/sources/examples/talk-llama/llama-impl.h +2 -0
  57. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +326 -0
  58. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.h +137 -0
  59. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +1248 -1967
  60. data/ext/sources/examples/talk-llama/llama-kv-cache.h +218 -345
  61. data/ext/sources/examples/talk-llama/llama-kv-cells.h +164 -52
  62. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +266 -0
  63. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +139 -0
  64. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1154 -0
  65. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +182 -0
  66. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  67. data/ext/sources/examples/talk-llama/llama-memory.h +94 -4
  68. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  69. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +44 -17
  70. data/ext/sources/examples/talk-llama/llama-model-loader.h +3 -2
  71. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  72. data/ext/sources/examples/talk-llama/llama-model.cpp +11377 -5248
  73. data/ext/sources/examples/talk-llama/llama-model.h +87 -9
  74. data/ext/sources/examples/talk-llama/llama-quant.cpp +137 -16
  75. data/ext/sources/examples/talk-llama/llama-sampling.cpp +226 -126
  76. data/ext/sources/examples/talk-llama/llama-vocab.cpp +502 -38
  77. data/ext/sources/examples/talk-llama/llama-vocab.h +46 -0
  78. data/ext/sources/examples/talk-llama/llama.cpp +76 -17
  79. data/ext/sources/examples/talk-llama/llama.h +176 -151
  80. data/ext/sources/examples/talk-llama/talk-llama.cpp +11 -6
  81. data/ext/sources/examples/talk-llama/unicode.cpp +212 -0
  82. data/ext/sources/examples/talk-llama/unicode.h +45 -0
  83. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  84. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +6 -2
  85. data/ext/sources/examples/whisper.wasm/index-tmpl.html +17 -16
  86. data/ext/sources/ggml/CMakeLists.txt +106 -33
  87. data/ext/sources/ggml/cmake/common.cmake +24 -0
  88. data/ext/sources/ggml/cmake/ggml-config.cmake.in +132 -93
  89. data/ext/sources/ggml/include/ggml-backend.h +18 -2
  90. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  91. data/ext/sources/ggml/include/ggml-metal.h +1 -6
  92. data/ext/sources/ggml/include/ggml-opt.h +25 -6
  93. data/ext/sources/ggml/include/ggml-webgpu.h +19 -0
  94. data/ext/sources/ggml/include/ggml-zdnn.h +17 -0
  95. data/ext/sources/ggml/include/ggml.h +365 -21
  96. data/ext/sources/ggml/src/CMakeLists.txt +98 -25
  97. data/ext/sources/ggml/src/ggml-alloc.c +265 -141
  98. data/ext/sources/ggml/src/ggml-backend-impl.h +4 -1
  99. data/ext/sources/ggml/src/ggml-backend-reg.cpp +35 -13
  100. data/ext/sources/ggml/src/ggml-backend.cpp +266 -60
  101. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +4 -4
  102. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +5 -4
  103. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +15 -0
  104. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  105. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +903 -717
  106. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +143 -25
  107. data/ext/sources/ggml/src/ggml-cann/common.h +149 -2
  108. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +521 -78
  109. data/ext/sources/ggml/src/ggml-common.h +21 -0
  110. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +165 -50
  111. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +5 -3
  112. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  113. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  114. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +3650 -0
  115. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +1891 -0
  116. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2160 -0
  117. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  118. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  119. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1897 -0
  120. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  121. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  122. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  123. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  124. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  125. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +214 -0
  126. data/ext/sources/ggml/src/ggml-cpu/common.h +18 -3
  127. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +23 -7
  128. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +179 -110
  129. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +44 -33
  130. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  131. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +152 -18
  132. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +7 -1
  133. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +228 -98
  134. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +532 -1124
  135. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  136. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +3374 -2081
  137. data/ext/sources/ggml/src/ggml-cpu/ops.h +13 -8
  138. data/ext/sources/ggml/src/ggml-cpu/quants.c +1193 -0
  139. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +34 -0
  140. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1982 -0
  141. data/ext/sources/ggml/src/ggml-cpu/repack.h +120 -0
  142. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +367 -46
  143. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  144. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  145. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  146. data/ext/sources/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  147. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +3 -3
  148. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +1 -1
  149. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +272 -35
  150. data/ext/sources/ggml/src/ggml-cpu/vec.h +794 -142
  151. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +20 -16
  152. data/ext/sources/ggml/src/ggml-cuda/add-id.cu +58 -0
  153. data/ext/sources/ggml/src/ggml-cuda/add-id.cuh +3 -0
  154. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +330 -191
  155. data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  156. data/ext/sources/ggml/src/ggml-cuda/common.cuh +291 -81
  157. data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  158. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  159. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  160. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  161. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  162. data/ext/sources/ggml/src/ggml-cuda/conv2d.cu +166 -0
  163. data/ext/sources/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  164. data/ext/sources/ggml/src/ggml-cuda/convert.cu +117 -22
  165. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +20 -0
  166. data/ext/sources/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  167. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +64 -307
  168. data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  169. data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  170. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +499 -368
  171. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +142 -93
  172. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cu +755 -0
  173. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +3 -0
  174. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +593 -0
  175. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +90 -50
  176. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +185 -198
  177. data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +2 -0
  178. data/ext/sources/ggml/src/ggml-cuda/getrows.cu +50 -39
  179. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +636 -222
  180. data/ext/sources/ggml/src/ggml-cuda/im2col.cu +196 -35
  181. data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +1 -0
  182. data/ext/sources/ggml/src/ggml-cuda/mean.cu +73 -0
  183. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  184. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +198 -45
  185. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +123 -0
  186. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +496 -0
  187. data/ext/sources/ggml/src/ggml-cuda/mmq.cu +206 -57
  188. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +1262 -721
  189. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +506 -0
  190. data/ext/sources/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +4 -5
  191. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +64 -73
  192. data/ext/sources/ggml/src/ggml-cuda/norm.cu +284 -12
  193. data/ext/sources/ggml/src/ggml-cuda/norm.cuh +7 -0
  194. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  195. data/ext/sources/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  196. data/ext/sources/ggml/src/ggml-cuda/pad.cu +46 -23
  197. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  198. data/ext/sources/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  199. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +12 -10
  200. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  201. data/ext/sources/ggml/src/ggml-cuda/roll.cu +67 -0
  202. data/ext/sources/ggml/src/ggml-cuda/roll.cuh +5 -0
  203. data/ext/sources/ggml/src/ggml-cuda/rope.cu +21 -27
  204. data/ext/sources/ggml/src/ggml-cuda/scale.cu +14 -11
  205. data/ext/sources/ggml/src/ggml-cuda/set-rows.cu +276 -0
  206. data/ext/sources/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  207. data/ext/sources/ggml/src/ggml-cuda/softcap.cu +34 -0
  208. data/ext/sources/ggml/src/ggml-cuda/softcap.cuh +5 -0
  209. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +126 -59
  210. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  211. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +322 -98
  212. data/ext/sources/ggml/src/ggml-cuda/sum.cu +6 -10
  213. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +23 -19
  214. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  215. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  216. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  217. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  218. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  219. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  220. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  221. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  222. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  223. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  224. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  225. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  226. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  227. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  228. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  229. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  230. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  231. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  232. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  233. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  234. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  235. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  236. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  237. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  238. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  239. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  240. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  241. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  242. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  243. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  244. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  245. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  246. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  247. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  248. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  249. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  250. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  251. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +21 -18
  252. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  253. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  254. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  255. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  256. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  257. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  258. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  259. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  260. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  261. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  262. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  269. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +259 -0
  270. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +14 -0
  271. data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +3 -3
  272. data/ext/sources/ggml/src/ggml-cuda/unary.cu +179 -0
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +15 -0
  274. data/ext/sources/ggml/src/ggml-cuda/upscale.cu +92 -6
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  277. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +58 -36
  278. data/ext/sources/ggml/src/ggml-cuda/vendors/musa.h +4 -3
  279. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +14 -2
  280. data/ext/sources/ggml/src/ggml-impl.h +229 -175
  281. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +21 -17
  282. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  283. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  284. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  285. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +600 -0
  286. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +1376 -0
  287. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +226 -0
  288. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +1308 -0
  289. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +163 -63
  290. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +3158 -0
  291. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +82 -0
  292. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +718 -0
  293. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +3208 -1575
  294. data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +18 -8
  295. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  296. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +32 -0
  297. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +4430 -792
  298. data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  299. data/ext/sources/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  300. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  301. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  302. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  303. data/ext/sources/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  304. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +84 -0
  305. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  306. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  307. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +370 -0
  308. data/ext/sources/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  309. data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  310. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  311. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  312. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  313. data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  314. data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  315. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  316. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  317. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  318. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  319. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  320. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  321. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  322. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  323. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  324. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  325. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  326. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  327. data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  328. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  329. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  330. data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  331. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  332. data/ext/sources/ggml/src/ggml-opencl/kernels/set_rows.cl +189 -0
  333. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  334. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +34 -13
  335. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +34 -13
  336. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +34 -13
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +34 -13
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  344. data/ext/sources/ggml/src/ggml-opt.cpp +97 -41
  345. data/ext/sources/ggml/src/ggml-quants.c +117 -24
  346. data/ext/sources/ggml/src/ggml-quants.h +6 -0
  347. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +85 -62
  348. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  349. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +2 -0
  350. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +9 -0
  351. data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +6 -0
  352. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  353. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +13 -17
  354. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +21 -2
  355. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +116 -211
  356. data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +213 -1
  357. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  358. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +700 -1041
  359. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +20 -9
  360. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +17 -26
  361. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +2 -96
  362. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +393 -250
  363. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  364. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  365. data/ext/sources/ggml/src/ggml-sycl/quantize.hpp +133 -0
  366. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -11
  367. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +125 -21
  368. data/ext/sources/ggml/src/ggml-sycl/set_rows.cpp +234 -0
  369. data/ext/sources/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  370. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  371. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  372. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +4 -3
  373. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +105 -17
  374. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  375. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4198 -1145
  376. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  377. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  378. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  379. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  380. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  381. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +349 -0
  382. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  383. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  384. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +66 -12
  385. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +154 -0
  386. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  387. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +1 -1
  388. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +2 -1
  389. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +6 -5
  390. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +4 -2
  391. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  392. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  393. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  394. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  395. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  396. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  397. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +21 -0
  398. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +69 -24
  399. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +60 -20
  400. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +98 -42
  401. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +64 -27
  402. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +74 -13
  403. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  404. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  405. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  406. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  407. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +4 -17
  408. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +19 -10
  409. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +25 -15
  410. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +19 -0
  411. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  412. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  413. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  414. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +18 -14
  415. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +126 -0
  416. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +65 -1
  417. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  418. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  419. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +144 -531
  420. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +206 -38
  421. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp +556 -0
  422. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +12 -5
  423. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +15 -9
  424. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  425. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  426. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +24 -3
  427. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +53 -3
  428. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  429. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +64 -11
  430. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  431. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  432. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  433. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  434. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  435. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  436. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  437. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  438. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +29 -7
  439. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +4 -0
  440. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  441. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  442. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  443. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  444. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +4 -3
  445. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +101 -9
  446. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  447. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  448. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +338 -71
  449. data/ext/sources/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  450. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1558 -0
  451. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add.tmpl.wgsl +44 -0
  452. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/add_in_place.tmpl.wgsl +41 -0
  453. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  454. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  455. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  456. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +124 -0
  457. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  458. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  459. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul.tmpl.wgsl +44 -0
  460. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_in_place.tmpl.wgsl +41 -0
  461. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +907 -0
  462. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +57 -0
  463. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_in_place.wgsl +48 -0
  464. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +81 -0
  465. data/ext/sources/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  466. data/ext/sources/ggml/src/ggml-zdnn/common.hpp +59 -0
  467. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +628 -0
  468. data/ext/sources/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  469. data/ext/sources/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  470. data/ext/sources/ggml/src/ggml-zdnn/utils.cpp +79 -0
  471. data/ext/sources/ggml/src/ggml-zdnn/utils.hpp +19 -0
  472. data/ext/sources/ggml/src/ggml.c +802 -142
  473. data/ext/sources/ggml/src/ggml.cpp +26 -0
  474. data/ext/sources/ggml/src/gguf.cpp +32 -4
  475. data/ext/sources/include/whisper.h +2 -0
  476. data/ext/sources/src/CMakeLists.txt +2 -0
  477. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  478. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  479. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  480. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  481. data/ext/sources/src/whisper.cpp +241 -215
  482. data/ext/sources/tests/CMakeLists.txt +8 -1
  483. data/ext/sources/tests/test-vad-full.cpp +3 -3
  484. data/ext/sources/tests/test-vad.cpp +2 -2
  485. data/extsources.rb +15 -9
  486. data/lib/whisper/context.rb +15 -0
  487. data/lib/whisper/model/uri.rb +57 -2
  488. data/lib/whisper/segment.rb +58 -0
  489. data/sig/whisper.rbs +75 -38
  490. data/{tests → test}/helper.rb +1 -12
  491. data/{tests → test}/test_model.rb +9 -0
  492. data/test/test_package.rb +51 -0
  493. data/{tests → test}/test_params.rb +8 -0
  494. data/test/test_segment.rb +146 -0
  495. data/{tests → test}/test_whisper.rb +70 -0
  496. data/whispercpp.gemspec +2 -3
  497. metadata +246 -191
  498. data/ext/sources/.dockerignore +0 -3
  499. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  500. data/ext/sources/ci/run.sh +0 -336
  501. data/ext/sources/close-issue.yml +0 -28
  502. data/ext/sources/ggml/include/ggml-kompute.h +0 -50
  503. data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  504. data/ext/sources/ggml/src/ggml-amx/common.h +0 -94
  505. data/ext/sources/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
  506. data/ext/sources/ggml/src/ggml-amx/mmq.cpp +0 -2510
  507. data/ext/sources/ggml/src/ggml-amx/mmq.h +0 -17
  508. data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  509. data/ext/sources/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  510. data/ext/sources/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  511. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  512. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  513. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  514. data/ext/sources/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  515. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  516. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  517. data/ext/sources/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  518. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -6431
  519. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  520. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  521. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -357
  522. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  523. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -365
  524. data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  525. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -482
  526. data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -472
  527. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +0 -336
  528. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  529. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  530. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  531. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  532. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  533. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  534. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  535. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  536. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  537. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  538. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  539. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  540. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  541. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  542. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  543. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  544. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  545. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  546. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  547. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  548. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  549. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  550. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  551. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  552. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  553. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  554. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  555. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  556. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  557. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  558. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  559. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  560. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  561. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  562. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  563. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  564. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  565. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  566. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  567. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  568. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  569. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  570. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  571. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  572. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  573. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  574. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  575. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  576. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  577. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  578. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  579. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  580. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  581. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  582. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  583. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  584. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  585. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  586. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  587. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  588. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  589. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  590. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  591. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  592. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  593. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  594. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  595. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  596. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  597. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  598. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  599. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  600. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  601. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  602. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  603. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  604. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  605. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  606. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  607. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  608. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  609. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  610. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  611. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  612. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  613. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  614. data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  615. data/ext/sources/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  616. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  617. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  618. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  619. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  620. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  621. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  622. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  623. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  624. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  625. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  626. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  627. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  628. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  629. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  630. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  631. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  632. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  633. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  634. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  635. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  636. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  637. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  638. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  639. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  640. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  641. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  642. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  643. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  644. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  645. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  646. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  647. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  648. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  649. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  650. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  651. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  652. data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
  653. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +0 -5998
  654. data/tests/test_package.rb +0 -46
  655. data/tests/test_segment.rb +0 -74
  656. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  657. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  658. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  659. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  660. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  661. /data/{tests → test}/test_callback.rb +0 -0
  662. /data/{tests → test}/test_error.rb +0 -0
  663. /data/{tests → test}/test_vad.rb +0 -0
  664. /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/ggml/src/ggml-cpu/vec.h

@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
@@ -54,10 +55,25 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(__AVX2__)
+    for (; i + 7 < n; i += 8) {
+        __m256 vx = _mm256_loadu_ps(x + i);
+        __m256 vy = _mm256_loadu_ps(y + i);
+        __m256 vz = _mm256_add_ps(vx, vy);
+        _mm256_storeu_ps(z + i, vz);
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + y[i];
+    }
+}
+
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
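
The rewritten ggml_vec_add_f32 above follows a common SIMD shape: a main loop that handles eight floats per iteration in 256-bit registers, then a scalar loop for whatever remains (and for builds without AVX2). A minimal standalone sketch of the same pattern, assuming an AVX2-capable x86 toolchain; the file and function names here are illustrative, not part of ggml:

// simd_add_demo.c - illustrative only; mirrors the wide-loop + scalar-tail
// structure shown in the hunk above. Build with: cc -O2 -mavx2 simd_add_demo.c
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include <stdio.h>

static void vec_add_f32(int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    // 8 floats per iteration; unaligned loads/stores keep the calling contract unchanged
    for (; i + 7 < n; i += 8) {
        __m256 vz = _mm256_add_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i));
        _mm256_storeu_ps(z + i, vz);
    }
#endif
    // scalar tail covers the last n % 8 elements (or everything without AVX2)
    for (; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}

int main(void) {
    float x[11], y[11], z[11];
    for (int i = 0; i < 11; ++i) { x[i] = (float) i; y[i] = 1.0f; }
    vec_add_f32(11, z, x, y);
    printf("%.1f %.1f\n", z[0], z[10]); // expect 1.0 11.0
    return 0;
}
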
@@ -66,7 +82,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -74,20 +90,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -103,40 +119,153 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
103
119
  }
104
120
 
105
121
  #if defined(GGML_SIMD)
106
- const int np = (n & ~(GGML_F16_STEP - 1));
122
+ #if defined(__ARM_FEATURE_SVE)
123
+
124
+ const int sve_register_length = svcntb() * 8;
125
+ const int ggml_f16_epr = sve_register_length / 16; // running when 16
126
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
127
+
128
+ const int np = (n & ~(ggml_f16_step - 1));
129
+
130
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
131
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
134
+
135
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
136
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
139
+
140
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
141
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
142
+
143
+ for (int i = 0; i < np; i += ggml_f16_step) {
144
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
145
+
146
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
147
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
148
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
149
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150
+
151
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
152
+
153
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
154
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155
+ ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
156
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
107
157
 
108
- GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
158
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
109
159
 
110
- GGML_F16_VEC ax[GGML_F16_ARR];
111
- GGML_F16_VEC ay[GGML_F16_ARR];
160
+ ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
161
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
163
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
112
164
 
113
- for (int i = 0; i < np; i += GGML_F16_STEP) {
114
- for (int j = 0; j < GGML_F16_ARR; j++) {
115
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
165
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
116
166
 
117
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
118
- ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
167
+ ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
168
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169
+ ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
170
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
119
171
 
120
- sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
172
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
173
+
174
+ ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
175
+
176
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177
+ ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
178
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179
+
180
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
181
+
182
+ ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
183
+
184
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185
+ ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
186
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
187
+
188
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
189
+
190
+ ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
191
+
192
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193
+ ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
194
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195
+
196
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
197
+
198
+ ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
199
+
200
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201
+ ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
202
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
203
+ }
204
+
205
+ const int np2 = (n & ~(ggml_f16_epr - 1));
206
+ for (int k = np; k < np2; k += ggml_f16_epr) {
207
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
208
+
209
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
210
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
211
+ rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
212
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
213
+ }
214
+
215
+ if (np2 < n) {
216
+ svbool_t pg = svwhilelt_b16(np2, n);
217
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
218
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
219
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
220
+
221
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
222
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
223
+ }
224
+ GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
225
+ GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
226
+ #elif defined(__riscv_v_intrinsic)
227
+ // todo: RVV impl
228
+ for (int i = 0; i < n; ++i) {
229
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
230
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
231
+ }
232
+ }
233
+ #else
234
+ const int np = (n & ~(GGML_F16_STEP - 1));
235
+
236
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
237
+
238
+ GGML_F16_VEC ax[GGML_F16_ARR];
239
+ GGML_F16_VEC ay[GGML_F16_ARR];
240
+
241
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
242
+ for (int j = 0; j < GGML_F16_ARR; j++) {
243
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
244
+
245
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
246
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
247
+
248
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
249
+ }
121
250
  }
122
251
  }
123
- }
124
252
 
125
- // reduce sum0..sum3 to sum0
126
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
127
- GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
128
- }
253
+ // reduce sum0..sum3 to sum0
254
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
255
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
256
+ }
129
257
 
130
- // leftovers
131
- for (int i = np; i < n; ++i) {
132
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
133
- sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
258
+ // leftovers
259
+ for (int i = np; i < n; ++i) {
260
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
261
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
262
+ }
134
263
  }
135
- }
264
+ #endif
136
265
  #else
137
266
  for (int i = 0; i < n; ++i) {
138
267
  for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
139
- sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
268
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
140
269
  }
141
270
  }
142
271
  #endif
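Throughout these SVE paths the loop bounds come from the same bit trick: with a power-of-two step, np = n & ~(ggml_f16_step - 1) rounds n down to a multiple of the 8-register step, np2 = n & ~(ggml_f16_epr - 1) rounds down to a multiple of a single register, and the final n - np2 elements are handled with a predicate. A small self-contained illustration of that bookkeeping (plain C, no SVE needed; the numbers are example values):

    #include <assert.h>

    // round n down to a multiple of step, which must be a power of two
    static inline int round_down_pow2(int n, int step) {
        return n & ~(step - 1);
    }

    int main(void) {
        const int epr  = 8;        // elements per vector register (example value)
        const int step = 8 * epr;  // 8-register unroll, as in the SVE loops above
        const int n    = 300;

        const int np  = round_down_pow2(n, step); // 256: covered by the unrolled loop
        const int np2 = round_down_pow2(n, epr);  // 296: covered one register at a time
        assert(np == 256 && np2 == 296);          // the last 4 elements use a predicated pass
        return 0;
    }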
@@ -148,27 +277,116 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
148
277
 
149
278
  inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
150
279
  #if defined(GGML_SIMD)
151
- const int np = (n & ~(GGML_F32_STEP - 1));
280
+ #if defined(__ARM_FEATURE_SVE)
281
+
282
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
283
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
284
+ const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
285
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
286
+
287
+ const int np = (n & ~(ggml_f32_step - 1));
288
+ svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
289
+ svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
290
+ for (int i = 0; i < np; i += ggml_f32_step) {
291
+
292
+ ax1 = GGML_F32_VEC_LOAD(x + i);
293
+ ay1 = GGML_F32_VEC_LOAD(y + i);
294
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
295
+
296
+ GGML_F32_VEC_STORE(y + i, ay1);
297
+
298
+ ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
299
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
300
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
152
301
 
153
- GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
302
+ GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
154
303
 
155
- GGML_F32_VEC ax[GGML_F32_ARR];
156
- GGML_F32_VEC ay[GGML_F32_ARR];
304
+ ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
305
+ ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
306
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
157
307
 
158
- for (int i = 0; i < np; i += GGML_F32_STEP) {
159
- for (int j = 0; j < GGML_F32_ARR; j++) {
160
- ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
161
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
162
- ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
308
+ GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
163
309
 
164
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
310
+ ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
311
+ ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
312
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
313
+
314
+ GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
315
+
316
+ ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
317
+ ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
318
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
319
+
320
+ GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
321
+
322
+ ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
323
+ ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
324
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
325
+
326
+ GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
327
+
328
+ ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
329
+ ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
330
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
331
+
332
+ GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
333
+
334
+ ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
335
+ ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
336
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
337
+
338
+ GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
165
339
  }
166
- }
340
+ // leftovers
341
+ // The loop above is unrolled by 8, so the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
342
+ const int np2 = (n & ~(ggml_f32_epr - 1));
343
+ for (int i = np; i < np2; i += ggml_f32_epr) {
344
+ ax1 = GGML_F32_VEC_LOAD(x + i);
345
+ ay1 = GGML_F32_VEC_LOAD(y + i);
346
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
347
+
348
+ GGML_F32_VEC_STORE(y + i, ay1);
349
+ }
350
+ // the maximum number of leftover elements will be less than ggml_f32_epr; apply a predicated svmad to the available elements only
351
+ if (np2 < n) {
352
+ svbool_t pg = svwhilelt_b32(np2, n);
353
+ ax1 = svld1_f32(pg, x + np2);
354
+ ay1 = svld1_f32(pg, y + np2);
355
+ ay1 = svmad_f32_m(pg, ax1, vx, ay1);
356
+
357
+ svst1_f32(pg, y + np2, ay1);
358
+ }
359
+ #elif defined(__riscv_v_intrinsic)
360
+ for (int i = 0, avl; i < n; i += avl) {
361
+ avl = __riscv_vsetvl_e32m8(n - i);
362
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
363
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
364
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
365
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
366
+ }
367
+ #else
368
+ const int np = (n & ~(GGML_F32_STEP - 1));
167
369
 
168
- // leftovers
169
- for (int i = np; i < n; ++i) {
170
- y[i] += x[i]*v;
171
- }
370
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
371
+
372
+ GGML_F32_VEC ax[GGML_F32_ARR];
373
+ GGML_F32_VEC ay[GGML_F32_ARR];
374
+
375
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
376
+ for (int j = 0; j < GGML_F32_ARR; j++) {
377
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
378
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
379
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
380
+
381
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
382
+ }
383
+ }
384
+
385
+ // leftovers
386
+ for (int i = np; i < n; ++i) {
387
+ y[i] += x[i]*v;
388
+ }
389
+ #endif
172
390
  #else
173
391
  // scalar
174
392
  for (int i = 0; i < n; ++i) {
@@ -179,31 +397,116 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
179
397
 
180
398
  inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
181
399
  #if defined(GGML_SIMD)
182
- const int np = (n & ~(GGML_F16_STEP - 1));
400
+ #if defined(__ARM_FEATURE_SVE)
401
+ const int sve_register_length = svcntb() * 8;
402
+ const int ggml_f16_epr = sve_register_length / 16;
403
+ const int ggml_f16_step = 8 * ggml_f16_epr;
404
+
405
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
406
+
407
+ const int np = (n & ~(ggml_f16_step - 1));
408
+
409
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
410
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
411
+ for (int i = 0; i < np; i += ggml_f16_step) {
412
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
413
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
414
+ ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
415
+
416
+ GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
417
+
418
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
419
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
420
+ ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
421
+
422
+ GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
183
423
 
184
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
424
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
425
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
426
+ ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
185
427
 
186
- GGML_F16_VEC ax[GGML_F16_ARR];
187
- GGML_F16_VEC ay[GGML_F16_ARR];
428
+ GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
188
429
 
189
- for (int i = 0; i < np; i += GGML_F16_STEP) {
190
- for (int j = 0; j < GGML_F16_ARR; j++) {
191
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
192
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
193
- ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
430
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
431
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
432
+ ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
194
433
 
195
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
434
+ GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
435
+
436
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
437
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
438
+ ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
439
+
440
+ GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
441
+
442
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
443
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
444
+ ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
445
+
446
+ GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
447
+
448
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
449
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
450
+ ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
451
+
452
+ GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
453
+
454
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
455
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
456
+ ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
457
+
458
+ GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
196
459
  }
197
- }
460
+ const int np2 = (n & ~(ggml_f16_epr - 1));
461
+ for (int k = np; k < np2; k += ggml_f16_epr) {
462
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
463
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
464
+ ry = GGML_F16x_VEC_FMA(ry, rx, vx);
198
465
 
199
- // leftovers
200
- for (int i = np; i < n; ++i) {
201
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
202
- }
466
+ GGML_F16x_VEC_STORE(y + k, ry, 0);
467
+ }
468
+
469
+ if (np2 < n) {
470
+ svbool_t pg = svwhilelt_b16(np2, n);
471
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
472
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
473
+ hy = svmad_f16_x(pg, hx, vx, hy);
474
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
475
+ }
476
+
477
+ #elif defined(__riscv_v_intrinsic)
478
+ // todo: RVV impl
479
+ // scalar
480
+ for (int i = 0; i < n; ++i) {
481
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
482
+ }
483
+ #else
484
+ const int np = (n & ~(GGML_F16_STEP - 1));
485
+
486
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
487
+
488
+ GGML_F16_VEC ax[GGML_F16_ARR];
489
+ GGML_F16_VEC ay[GGML_F16_ARR];
490
+
491
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
492
+ for (int j = 0; j < GGML_F16_ARR; j++) {
493
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
494
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
495
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
496
+
497
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
498
+ }
499
+ }
500
+
501
+ // leftovers
502
+ for (int i = np; i < n; ++i) {
503
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
504
+ }
505
+ #endif
203
506
  #else
204
507
  // scalar
205
508
  for (int i = 0; i < n; ++i) {
206
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
509
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
207
510
  }
208
511
  #endif
209
512
  }
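Both ggml_vec_mad_f32 and ggml_vec_mad_f16 above end with the same SVE tail idiom: build a predicate with svwhilelt so only the remaining lanes are active, then issue one predicated load/FMA/store instead of a scalar cleanup loop. A minimal f32 sketch of that idiom, assuming an SVE-enabled compiler; it uses svmla_n_f32_m (the scalar-operand form) rather than the svmad form above, so it is illustrative rather than a drop-in:

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    // y[i] += x[i]*v for i in [start, n), one predicated vector pass at a time
    static void axpy_tail_sve(int start, int n, float * y, const float * x, float v) {
        for (int i = start; i < n; i += (int) svcntw()) {
            const svbool_t    pg = svwhilelt_b32(i, n);          // active lanes: i .. min(i+VL, n)-1
            const svfloat32_t ax = svld1_f32(pg, x + i);         // inactive lanes read as zero
            const svfloat32_t ay = svld1_f32(pg, y + i);
            const svfloat32_t r  = svmla_n_f32_m(pg, ay, ax, v); // ay + ax*v on active lanes
            svst1_f32(pg, y + i, r);                             // store active lanes only
        }
    }
    #endif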
@@ -220,36 +523,55 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
220
523
  }
221
524
 
222
525
  #if defined(GGML_SIMD)
223
- const int np = (n & ~(GGML_F32_STEP - 1));
526
+ #if defined(__ARM_FEATURE_SVE)
527
+ // route to the scalar implementation // TODO: write SVE code
528
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
529
+ for (int i = 0; i < n; ++i) {
530
+ y[i] += x[k][i]*v[k][0];
531
+ }
532
+ }
533
+ #elif defined(__riscv_v_intrinsic)
534
+ for (int i = 0, avl; i < n; i += avl) {
535
+ avl = __riscv_vsetvl_e32m8(n - i);
536
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
537
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
538
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
539
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
540
+ }
541
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
542
+ }
543
+ #else
544
+ const int np = (n & ~(GGML_F32_STEP - 1));
224
545
 
225
- GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
546
+ GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
226
547
 
227
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
228
- vx[k] = GGML_F32_VEC_SET1(v[k][0]);
229
- }
548
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
549
+ vx[k] = GGML_F32_VEC_SET1(v[k][0]);
550
+ }
230
551
 
231
- GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
232
- GGML_F32_VEC ay[GGML_F32_ARR];
552
+ GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
553
+ GGML_F32_VEC ay[GGML_F32_ARR];
233
554
 
234
- for (int i = 0; i < np; i += GGML_F32_STEP) {
235
- for (int j = 0; j < GGML_F32_ARR; j++) {
236
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
555
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
556
+ for (int j = 0; j < GGML_F32_ARR; j++) {
557
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
237
558
 
238
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
239
- ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
240
- ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
241
- }
559
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
560
+ ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
561
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
562
+ }
242
563
 
243
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
564
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
565
+ }
244
566
  }
245
- }
246
567
 
247
- // leftovers
248
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
249
- for (int i = np; i < n; ++i) {
250
- y[i] += x[k][i]*v[k][0];
568
+ // leftovers
569
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
570
+ for (int i = np; i < n; ++i) {
571
+ y[i] += x[k][i]*v[k][0];
572
+ }
251
573
  }
252
- }
574
+ #endif
253
575
  #else
254
576
  // scalar
255
577
  for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -260,30 +582,112 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
260
582
  #endif
261
583
  }
262
584
 
585
+ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
586
+ #if defined(GGML_USE_ACCELERATE)
587
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
588
+ #elif defined(GGML_SIMD)
589
+ #if defined(__ARM_FEATURE_SVE)
590
+ // scalar ; TODO: Write SVE code
591
+ for (int i = 0; i < n; ++i) {
592
+ y[i] = x[i]*s + b;
593
+ }
594
+ #elif defined(__riscv_v_intrinsic)
595
+ for (int i = 0, avl; i < n; i += avl) {
596
+ avl = __riscv_vsetvl_e32m8(n - i);
597
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
598
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
599
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
600
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
601
+ }
602
+ #else
603
+ const int np = (n & ~(GGML_F32_STEP - 1));
604
+
605
+ GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
606
+ GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
607
+
608
+ GGML_F32_VEC ay[GGML_F32_ARR];
609
+
610
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
611
+ for (int j = 0; j < GGML_F32_ARR; j++) {
612
+ ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
613
+ ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
614
+
615
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
616
+ }
617
+ }
618
+
619
+ // leftovers
620
+ for (int i = np; i < n; ++i) {
621
+ y[i] = x[i]*s + b;
622
+ }
623
+ #endif
624
+ #else
625
+ // scalar
626
+ for (int i = 0; i < n; ++i) {
627
+ y[i] = x[i]*s + b;
628
+ }
629
+ #endif
630
+ }
631
+
263
632
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
264
633
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
265
634
  #if defined(GGML_USE_ACCELERATE)
266
635
  vDSP_vsmul(y, 1, &v, y, 1, n);
267
636
  #elif defined(GGML_SIMD)
268
- const int np = (n & ~(GGML_F32_STEP - 1));
637
+ #if defined(__ARM_FEATURE_SVE)
638
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
639
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
640
+ const int ggml_f32_step = 2 * ggml_f32_epr;
641
+
642
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
643
+ const int np = (n & ~(ggml_f32_step - 1));
644
+ svfloat32_t ay1;
645
+ svfloat32_t ay2;
646
+ for (int i = 0; i < np; i += ggml_f32_step) {
647
+ ay1 = GGML_F32_VEC_LOAD(y + i);
648
+ ay1 = GGML_F32_VEC_MUL(ay1, vx);
649
+ GGML_F32_VEC_STORE(y + i, ay1);
650
+
651
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
652
+ ay2 = GGML_F32_VEC_MUL(ay2, vx);
653
+ GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
654
+ }
655
+ // leftovers
656
+ // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
657
+ if (np < n) {
658
+ svbool_t pg = svwhilelt_b32(np, n);
659
+ ay1 = svld1_f32(pg, y + np);
660
+ ay1 = svmul_f32_m(pg, ay1, vx);
661
+ svst1_f32(pg, y + np, ay1);
662
+ }
663
+ #elif defined(__riscv_v_intrinsic)
664
+ for (int i = 0, avl; i < n; i += avl) {
665
+ avl = __riscv_vsetvl_e32m8(n - i);
666
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
667
+ vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
668
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
669
+ }
670
+ #else
671
+ const int np = (n & ~(GGML_F32_STEP - 1));
269
672
 
270
- GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
673
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
271
674
 
272
- GGML_F32_VEC ay[GGML_F32_ARR];
675
+ GGML_F32_VEC ay[GGML_F32_ARR];
273
676
 
274
- for (int i = 0; i < np; i += GGML_F32_STEP) {
275
- for (int j = 0; j < GGML_F32_ARR; j++) {
276
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
277
- ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
677
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
678
+ for (int j = 0; j < GGML_F32_ARR; j++) {
679
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
680
+ ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
278
681
 
279
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
682
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
683
+ }
280
684
  }
281
- }
282
685
 
283
- // leftovers
284
- for (int i = np; i < n; ++i) {
285
- y[i] *= v;
286
- }
686
+ // leftovers
687
+ for (int i = np; i < n; ++i) {
688
+ y[i] *= v;
689
+ }
690
+ #endif
287
691
  #else
288
692
  // scalar
289
693
  for (int i = 0; i < n; ++i) {
@@ -294,29 +698,63 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
294
698
 
295
699
  inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
296
700
  #if defined(GGML_SIMD)
297
- const int np = (n & ~(GGML_F16_STEP - 1));
701
+ #if defined(__ARM_FEATURE_SVE)
702
+ const int sve_register_length = svcntb() * 8;
703
+ const int ggml_f16_epr = sve_register_length / 16;
704
+ const int ggml_f16_step = 2 * ggml_f16_epr;
705
+
706
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
707
+ const int np = (n & ~(ggml_f16_step - 1));
708
+ svfloat16_t ay1, ay2;
709
+
710
+ for (int i = 0; i < np; i += ggml_f16_step) {
711
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
712
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
713
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
714
+
715
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
716
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
717
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
718
+ }
719
+ // leftovers
720
+ // the maximum number of leftover elements will be less than ggml_f16_epr; apply a predicated svmul to the available elements only
721
+ if (np < n) {
722
+ svbool_t pg = svwhilelt_b16(np, n);
723
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
724
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
725
+ svst1_f16(pg, (__fp16 *)(y + np), out);
726
+ }
727
+ #elif defined(__riscv_v_intrinsic)
728
+ // todo: RVV impl
729
+ // scalar
730
+ for (int i = 0; i < n; ++i) {
731
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
732
+ }
733
+ #else
734
+ const int np = (n & ~(GGML_F16_STEP - 1));
298
735
 
299
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
736
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
300
737
 
301
- GGML_F16_VEC ay[GGML_F16_ARR];
738
+ GGML_F16_VEC ay[GGML_F16_ARR];
302
739
 
303
- for (int i = 0; i < np; i += GGML_F16_STEP) {
304
- for (int j = 0; j < GGML_F16_ARR; j++) {
305
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
306
- ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
740
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
741
+ for (int j = 0; j < GGML_F16_ARR; j++) {
742
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
743
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
307
744
 
308
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
745
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
746
+ }
309
747
  }
310
- }
311
748
 
312
- // leftovers
313
- for (int i = np; i < n; ++i) {
314
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
315
- }
749
+ // leftovers
750
+ for (int i = np; i < n; ++i) {
751
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
752
+ }
753
+ #endif
316
754
  #else
317
755
  // scalar
318
756
  for (int i = 0; i < n; ++i) {
319
- y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
757
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
320
758
  }
321
759
  #endif
322
760
  }
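The __riscv_v_intrinsic branches added in these hunks all use the same stripmining shape: __riscv_vsetvl_e32m8 reports how many elements the hardware will process this pass, the loop operates on exactly that many, and then advances by that amount, so no separate tail loop is needed. A minimal standalone sketch of the pattern for the scale-by-v case, assuming a toolchain with the RVV 1.0 intrinsics:

    #if defined(__riscv_v_intrinsic)
    #include <riscv_vector.h>

    // y[i] *= v, processed in hardware-sized chunks chosen by vsetvl
    static void scale_f32_rvv(int n, float * y, float v) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);                   // elements handled this pass
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); // load up to avl floats
            ay = __riscv_vfmul_vf_f32m8(ay, v, avl);             // multiply by the scalar
            __riscv_vse32_v_f32m8(&y[i], ay, avl);               // store the same lanes back
        }
    }
    #endif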
@@ -325,103 +763,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
325
763
  inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
326
764
  inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
327
765
  for (int i = 0; i < n; ++i) {
328
- float v = GGML_FP16_TO_FP32(x[i]);
329
- y[i] = GGML_FP32_TO_FP16(v*v);
766
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
767
+ y[i] = GGML_CPU_FP32_TO_FP16(v*v);
330
768
  }
331
769
  }
332
770
  inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
333
771
  inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
334
772
  for (int i = 0; i < n; ++i) {
335
- y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
773
+ y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
336
774
  }
337
775
  }
338
776
  inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
339
777
  inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
340
778
  for (int i = 0; i < n; ++i) {
341
- y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
779
+ y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
342
780
  }
343
781
  }
344
782
  inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
345
783
  inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
346
784
  for (int i = 0; i < n; ++i) {
347
- y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
785
+ y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
348
786
  }
349
787
  }
350
788
  inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
351
789
  inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
352
790
  for (int i = 0; i < n; ++i) {
353
- y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
791
+ y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
354
792
  }
355
793
  }
356
794
  inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
357
795
  inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
358
796
  for (int i = 0; i < n; ++i) {
359
- y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
797
+ y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
360
798
  }
361
799
  }
362
800
  inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
363
801
  inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
364
802
  for (int i = 0; i < n; ++i) {
365
- float v = GGML_FP16_TO_FP32(x[i]);
366
- y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
803
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
804
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
367
805
  }
368
806
  }
369
807
  inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
370
808
  inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
371
809
  for (int i = 0; i < n; ++i) {
372
- y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
810
+ y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
373
811
  }
374
812
  }
375
813
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
376
814
  inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
377
815
  for (int i = 0; i < n; ++i) {
378
- y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
816
+ y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
379
817
  }
380
818
  }
381
819
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
382
820
  inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
383
821
  for (int i = 0; i < n; ++i) {
384
- y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
822
+ y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
385
823
  }
386
824
  }
387
825
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
388
826
  inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
389
827
  for (int i = 0; i < n; ++i) {
390
- float v = GGML_FP16_TO_FP32(x[i]);
391
- y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
828
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
829
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
392
830
  }
393
831
  }
394
832
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
395
833
  inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
396
834
  for (int i = 0; i < n; ++i) {
397
- float v = GGML_FP16_TO_FP32(x[i]);
398
- y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
835
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
836
+ y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
399
837
  }
400
838
  }
401
839
  inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
402
840
  inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
403
841
  for (int i = 0; i < n; ++i) {
404
- y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
842
+ y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
405
843
  }
406
844
  }
407
845
  // TODO: optimize performance
408
846
  inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
409
847
  inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
410
848
  for (int i = 0; i < n; ++i) {
411
- float v = GGML_FP16_TO_FP32(x[i]);
412
- y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
849
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
850
+ y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
413
851
  }
414
852
  }
415
853
  inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
416
854
  inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
417
855
  for (int i = 0; i < n; ++i) {
418
- y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
856
+ y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
419
857
  }
420
858
  }
421
859
  inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
422
860
  inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
423
861
  for (int i = 0; i < n; ++i) {
424
- y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
862
+ y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
425
863
  }
426
864
  }
427
865
 
@@ -443,9 +881,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
443
881
 
444
882
  inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
445
883
  for (int i = 0; i < n; ++i) {
446
- float xi = GGML_FP16_TO_FP32(x[i]);
884
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
447
885
  float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
448
- y[i] = GGML_FP32_TO_FP16(res);
886
+ y[i] = GGML_CPU_FP32_TO_FP16(res);
449
887
  }
450
888
  }
451
889
 
@@ -458,9 +896,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
458
896
  } else if (x[i] >= 10.0f) {
459
897
  y[i] = x[i];
460
898
  } else {
461
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
899
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
462
900
  memcpy(&t, &fp16, sizeof(uint16_t));
463
- y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
901
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
464
902
  }
465
903
  }
466
904
  }
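The branch above evaluates GELU through a 65536-entry table indexed by the raw fp16 bit pattern: the input is narrowed to half precision, its bits are copied into a uint16_t, and that value indexes the precomputed ggml_table_gelu_f16. A sketch of how such a table could be populated, assuming the conversion macros above; the local table and the gelu_ref() callback are hypothetical names used only for illustration:

    #include <stdint.h>
    #include <string.h>

    // hypothetical sketch: precompute an activation for every possible fp16 bit pattern
    static ggml_fp16_t table_gelu_f16_sketch[1 << 16];

    static void init_gelu_table_sketch(float (*gelu_ref)(float)) {
        for (uint32_t t = 0; t < (1u << 16); ++t) {
            const uint16_t bits = (uint16_t) t;
            ggml_fp16_t h;
            memcpy(&h, &bits, sizeof(h));             // reinterpret the bits as fp16
            const float x = GGML_CPU_FP16_TO_FP32(h); // decode (NaN patterns included)
            table_gelu_f16_sketch[t] = GGML_CPU_FP32_TO_FP16(gelu_ref(x));
        }
    }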
@@ -494,9 +932,9 @@ inline static float ggml_gelu_quick_f32(float x) {
494
932
  inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
495
933
  uint16_t t;
496
934
  for (int i = 0; i < n; ++i) {
497
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
935
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
498
936
  memcpy(&t, &fp16, sizeof(uint16_t));
499
- y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
937
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
500
938
  }
501
939
  }
502
940
  #else
@@ -509,8 +947,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
509
947
 
510
948
  inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
511
949
  for (int i = 0; i < n; ++i) {
512
- float v = GGML_FP16_TO_FP32(x[i]);
513
- y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
950
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
951
+ y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
514
952
  }
515
953
  }
516
954
 
@@ -519,8 +957,8 @@ inline static float ggml_silu_f32(float x) {
519
957
  return x/(1.0f + expf(-x));
520
958
  }
521
959
  inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
522
- float v = GGML_FP16_TO_FP32(x);
523
- return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
960
+ float v = GGML_CPU_FP16_TO_FP32(x);
961
+ return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
524
962
  }
525
963
 
526
964
  #if __FINITE_MATH_ONLY__
@@ -528,7 +966,75 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
528
966
  #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
529
967
  #endif
530
968
 
531
- #if defined(__ARM_NEON) && defined(__aarch64__)
969
+ /* Below function was borrowed from the GitHub repository:
970
+ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
971
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
972
+ inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
973
+ // Constants
974
+ const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
975
+ const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
976
+ const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
977
+ const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
978
+ const svfloat32_t one = svdup_n_f32(1.0f);
979
+ const svfloat32_t inactive1 = svdup_n_f32(0.0f);
980
+ const svint32_t inactive2 = svdup_n_s32(0);
981
+
982
+ // Algorithm starts here
983
+ svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
984
+ svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round down to integer (still as float)
985
+ svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
986
+
987
+ t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
988
+ t1 = svadd_f32_m(pg, t1, one); // b = a + 1
989
+
990
+ svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
991
+ svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
992
+ t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
993
+
994
+ // and_(t2.d, t1.d, not_mask17.d)
995
+ svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
996
+ t5 = svsub_f32_m(pg, t1, t5); // z
997
+ t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
998
+ t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
999
+ t0 = svmul_f32_m(pg, t0, t4); // Final result
1000
+
1001
+ return t0;
1002
+ }
1003
+ #endif
1004
+
1005
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1006
+
1007
+ inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
1008
+ const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1009
+ const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1010
+ const svfloat32_t n = svsub_f32_x(pg, z, r);
1011
+ const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1012
+ const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1013
+ const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1014
+ const svbool_t c = svacgt_n_f32(pg, n, 126);
1015
+ const svfloat32_t u = svmul_f32_x(pg, b, b);
1016
+ const svfloat32_t j = svmla_f32_x(pg,
1017
+ svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1018
+ svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1019
+ svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1020
+ const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1021
+ const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1022
+ const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1023
+ return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1024
+ svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1025
+ }
1026
+
1027
+ // computes silu x/(1+exp(-x)) in single precision vector
1028
+ inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1029
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1030
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1031
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1032
+ const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1033
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1034
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1035
+ }
1036
+
1037
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
532
1038
 
533
1039
  // adapted from arm limited optimized routine
534
1040
  // the maximum error is 1.45358 plus 0.5 ulps
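The vectorized exponentials in this file (the SVE ggml_v_expf above, the NEON/AVX/SSE versions referenced here, and the RVV ggml_v_expf_m2 added below) share one range-reduction scheme: n = round(x*log2(e)), r = x - n*ln(2) (with ln(2) split into high and low parts for accuracy), a short polynomial for e^r, and a final scale by 2^n assembled through the float exponent bits. A scalar sketch of the scheme; it keeps only a small polynomial and skips the overflow/underflow handling the SIMD versions perform:

    #include <math.h>

    // illustrative only: e^x via n = round(x*log2(e)), r = x - n*ln2, e^x = 2^n * e^r
    static float expf_sketch(float x) {
        const float n = rintf(x * 1.442695041f);   // x * log2(e), rounded to nearest
        const float r = x - n * 0.693147181f;      // reduced argument, |r| <= ln(2)/2
        // short Taylor polynomial for e^r (the SIMD code uses tuned coefficients)
        const float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f/6.0f + r * (1.0f/24.0f))));
        return ldexpf(p, (int) n);                 // scale by 2^n
    }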
@@ -719,7 +1225,59 @@ inline static __m128 ggml_v_silu(__m128 x) {
719
1225
  return _mm_div_ps(x, one_plus_exp_neg_x);
720
1226
  }
721
1227
 
722
- #endif // __ARM_NEON / __AVX2__ / __SSE2__
1228
+ #elif defined(__riscv_v_intrinsic)
1229
+
1230
+ // adapted from arm limited optimized routine
1231
+ // the maximum error is 1.45358 plus 0.5 ulps
1232
+ // numbers above 88.38 will flush to infinity
1233
+ // numbers beneath -103.97 will flush to zero
1234
+ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1235
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
1236
+ #ifdef __riscv_xtheadvector
1237
+ // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1238
+ vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
1239
+ z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
1240
+ #else
1241
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
1242
+ #endif
1243
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
1244
+ const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
1245
+ 0x1.7f7d1cp-20f, n, vl);
1246
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
1247
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
1248
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
1249
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
1250
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
1251
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
1252
+ __riscv_vfmacc_vv_f32m2(
1253
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
1254
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
1255
+ u, vl), u, vl);
1256
+ if (!__riscv_vcpop_m_b16(c, vl))
1257
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
1258
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
1259
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
1260
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
1261
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
1262
+ const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
1263
+ __riscv_vfmacc_vv_f32m2(k, k, j, vl),
1264
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
1265
+ c, vl);
1266
+ return __riscv_vmerge_vvm_f32m2(
1267
+ r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
1268
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
1269
+ vl);
1270
+ }
1271
+
1272
+ // computes silu x/(1+exp(-x)) in single precision vector
1273
+ inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1274
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1275
+ const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1276
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1277
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1278
+ }
1279
+
1280
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
723
1281
 
724
1282
  inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
725
1283
  for (int i = 0; i < n; ++i) {
@@ -733,9 +1291,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
733
1291
  }
734
1292
 
735
1293
  inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
736
- const float v = GGML_FP16_TO_FP32(x);
1294
+ const float v = GGML_CPU_FP16_TO_FP32(x);
737
1295
  const float s = 1.0f/(1.0f + expf(-v));
738
- return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
1296
+ return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
739
1297
  }
740
1298
 
741
1299
  inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -750,6 +1308,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
750
1308
  }
751
1309
  }
752
1310
 
1311
+ inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
1312
+ for (int i = 0; i < n; ++i) {
1313
+ y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
1314
+ }
1315
+ }
1316
+
1317
+ inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1318
+ for (int i = 0; i < n; ++i) {
1319
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
1320
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
1321
+ }
1322
+ }
1323
+
1324
+ #ifdef GGML_GELU_FP16
1325
+ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1326
+ uint16_t t;
1327
+ for (int i = 0; i < n; ++i) {
1328
+ if (x[i] <= -10.0f) {
1329
+ y[i] = 0.0f;
1330
+ } else if (x[i] >= 10.0f) {
1331
+ y[i] = x[i] * g[i];
1332
+ } else {
1333
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1334
+ memcpy(&t, &fp16, sizeof(uint16_t));
1335
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
1336
+ }
1337
+ }
1338
+ }
1339
+ #else
1340
+ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
1341
+ for (int i = 0; i < n; ++i) {
1342
+ y[i] = ggml_gelu_f32(x[i]) * g[i];
1343
+ }
1344
+ }
1345
+ #endif
1346
+
1347
+ inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1348
+ const uint16_t * i16 = (const uint16_t *) x;
1349
+ for (int i = 0; i < n; ++i) {
1350
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1351
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
1352
+ }
1353
+ }
1354
+
1355
+ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
1356
+
1357
+ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1358
+ for (int i = 0; i < n; ++i) {
1359
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1360
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1361
+ y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1362
+ }
1363
+ }
1364
+
1365
+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1366
+ for (int i = 0; i < n; ++i) {
1367
+ float xi = x[i];
1368
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1369
+ }
1370
+ }
1371
+
1372
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1373
+ for (int i = 0; i < n; ++i) {
1374
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1375
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1376
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1377
+ }
1378
+ }
1379
+
1380
+ #ifdef GGML_GELU_QUICK_FP16
1381
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1382
+ uint16_t t;
1383
+ for (int i = 0; i < n; ++i) {
1384
+ ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
1385
+ memcpy(&t, &fp16, sizeof(uint16_t));
1386
+ y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
1387
+ }
1388
+ }
1389
+ #else
1390
+ inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1391
+ for (int i = 0; i < n; ++i) {
1392
+ y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
1393
+ }
1394
+ }
1395
+ #endif
1396
+
1397
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1398
+ const uint16_t * i16 = (const uint16_t *) x;
1399
+ for (int i = 0; i < n; ++i) {
1400
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1401
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
1402
+ }
1403
+ }
1404
+
753
1405
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
754
1406
  #ifndef GGML_USE_ACCELERATE
755
1407
  ggml_float sum = 0.0;
@@ -773,7 +1425,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
  inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
- sum += GGML_FP16_TO_FP32(x[i]);
+ sum += GGML_CPU_FP16_TO_FP32(x[i]);
  }
  *s = sum;
  }
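The gated helpers added above (ggml_vec_reglu_*, ggml_vec_geglu_*, ggml_vec_swiglu_*) all share the shape y[i] = activation(x[i]) * g[i]. As a reference for the swiglu case, a scalar sketch that matches the f16 version above up to the half-precision round trips (ggml_vec_swiglu_f32 itself is only declared in this header and defined elsewhere):

    #include <math.h>

    // scalar reference: y[i] = silu(x[i]) * g[i], with silu(x) = x / (1 + exp(-x))
    static void vec_swiglu_f32_ref(const int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            y[i] = (x[i] / (1.0f + expf(-x[i]))) * g[i];
        }
    }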