whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
data/ext/sources/ggml/src/ggml-cpu/vec.h

@@ -5,6 +5,7 @@
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
@@ -57,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
 inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
@@ -66,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -74,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
     }
 }
 
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
 inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
-        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
 
@@ -130,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
 #endif
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 
 inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
+    #if defined(__ARM_FEATURE_SVE)
 
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
 
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
 
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
 
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+
+            GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+
+            GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+
+            GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+
+            GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+
+            GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+
+            GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
         }
-    }
+        // leftovers
+        // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+
+            GGML_F32_VEC_STORE(y + i, ay1);
+        }
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np2 < n) {
+            svbool_t pg =svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            ay1 = svmad_f32_m(pg, ax1, vx, ay1);
+
+            svst1_f32(pg, y + np2, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] += x[i]*v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
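Note: the new __ARM_FEATURE_SVE branch above sizes its vectors at run time (ggml_cpu_get_sve_cnt() * 8 bits, giving ggml_f32_epr floats per register), unrolls the main loop across eight SVE registers, and then drains the remainder first in single-register steps and finally with one predicated svmad_f32_m under an svwhilelt_b32 mask, so no scalar cleanup loop is left. As a minimal standalone sketch of that predicated-tail idea (written for this note, not code from the package), the whole y[i] += x[i]*v operation can also be expressed as a single fully predicated loop:

    // Sketch only: vector-length-agnostic y[i] += x[i]*v using one predicated SVE loop.
    // Assumes an AArch64 toolchain with SVE enabled (e.g. -march=armv8-a+sve).
    #include <arm_sve.h>

    static void axpy_f32_sve(int n, float * y, const float * x, float v) {
        const int step = (int) svcntw();           // f32 lanes per SVE vector
        for (int i = 0; i < n; i += step) {
            svbool_t pg = svwhilelt_b32(i, n);     // masks off lanes past n on the last iteration
            svfloat32_t ax = svld1_f32(pg, x + i);
            svfloat32_t ay = svld1_f32(pg, y + i);
            ay = svmla_n_f32_m(pg, ay, ax, v);     // ay += ax * v on active lanes only
            svst1_f32(pg, y + i, ay);
        }
    }

The kernel in the diff prefers the wide unrolled main loop and keeps the predicate for the final partial vector only, which tends to hide load latency better than predicating every iteration.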
@@ -198,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
     }
 
 #if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar Route to scalar implementation //TODO: Write SVE code
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = 0; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
 
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
+        GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
 
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+        }
 
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
+        GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
 
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
 
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
+                for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                    ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                    ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+                }
 
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
+        // leftovers
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+            for (int i = np; i < n; ++i) {
+                y[i] += x[k][i]*v[k][0];
+            }
         }
-    }
+    #endif
 #else
     // scalar
     for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
     vDSP_vsmul(y, 1, &v, y, 1, n);
 #elif defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 2 * ggml_f32_epr;
+
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t ay1;
+        svfloat32_t ay2;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            ay1 = GGML_F32_VEC_MUL(ay1, vx);
+            GGML_F32_VEC_STORE(y + i, ay1);
+
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_MUL(ay2, vx);
+            GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b32(np, n);
+            ay1 = svld1_f32(pg, y + np);
+            ay1 = svmul_f32_m(pg, ay1, vx);
+            svst1_f32(pg, y + np, ay1);
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
 
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+        GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
 
-    GGML_F32_VEC ay[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
 
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
 
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
         }
-    }
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] *= v;
+        }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
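Note: ggml_vec_scale_f32 gets the same treatment with a lighter two-register unroll and a single predicated svmul_f32_m for the remaining elements. For comparison, a fully predicated form of the same in-place scaling (an illustration written for this note, not code from the package) would be:

    // Sketch only: y[i] *= v with the tail handled by the governing predicate.
    #include <arm_sve.h>

    static void scale_f32_sve(int n, float * y, float v) {
        const svfloat32_t vv = svdup_n_f32(v);
        for (int i = 0; i < n; i += (int) svcntw()) {
            svbool_t pg = svwhilelt_b32(i, n);
            svfloat32_t ay = svld1_f32(pg, y + i);
            svst1_f32(pg, y + i, svmul_f32_m(pg, ay, vv));
        }
    }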
@@ -311,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
 
     // leftovers
     for (int i = np; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
 #endif
 }
@@ -325,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v*v);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
     }
 }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
 inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
 inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 
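Note: every f16 helper in this block keeps the same shape after the rename: widen the element with GGML_CPU_FP16_TO_FP32, do the arithmetic in f32, and narrow the result back with GGML_CPU_FP32_TO_FP16; only the macro prefix changes to the CPU backend's GGML_CPU_* conversions. A hypothetical extra helper written in the same style (illustrative only; softsign is not part of this diff) would look like:

    // Sketch only: the widen/compute/narrow pattern used by the f16 helpers above.
    // ggml_fp16_t, fabsf and the GGML_CPU_* macros come from headers vec.h already pulls in.
    inline static void ggml_vec_softsign_f16_example(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
        for (int i = 0; i < n; ++i) {
            const float v = GGML_CPU_FP16_TO_FP32(x[i]);          // widen to f32
            y[i] = GGML_CPU_FP32_TO_FP16(v / (1.0f + fabsf(v)));  // compute in f32, narrow back
        }
    }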
@@ -443,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 
 inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi = GGML_FP16_TO_FP32(x[i]);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] = GGML_FP32_TO_FP16(res);
+        y[i] = GGML_CPU_FP32_TO_FP16(res);
     }
 }
 
@@ -458,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
         }
     }
 }
@@ -494,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) {
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -509,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 
 inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }
 
@@ -519,8 +638,8 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    float v = GGML_FP16_TO_FP32(x);
-    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+    float v = GGML_CPU_FP16_TO_FP32(x);
+    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }
 
 #if __FINITE_MATH_ONLY__
@@ -528,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
 #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
+/* Below function was borrowed from the GitHub repository:
+https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
+    // Constants
+    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
+    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
+    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
+    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
+    const svfloat32_t one = svdup_n_f32(1.0f);
+    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
+    const svint32_t inactive2 = svdup_n_s32(0);
+
+    // Algorithm starts here
+    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
+    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float)
+    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
+
+    t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
+    t1 = svadd_f32_m(pg, t1, one); // b = a + 1
+
+    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
+    svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
+    t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
+
+    // and_(t2.d, t1.d, not_mask17.d)
+    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
+    t5 = svsub_f32_m(pg, t1, t5); // z
+    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
+    t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
+    t0 = svmul_f32_m(pg, t0, t4); // Final result
+
+    return t0;
+}
+#endif
+
 #if defined(__ARM_NEON) && defined(__aarch64__)
 
 // adapted from arm limited optimized routine
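Note: exp_ps_sve evaluates expf across a whole SVE vector; per its comments it scales by log2(e), uses svexpa_f32 (the FEXPA lookup) plus a short polynomial in ln2 for the fractional part, and svscale_f32_m for the 2^n factor. Because the governing predicate is a parameter, a caller can apply it to partial vectors; a plausible usage sketch (written for this note, not code from the package) is:

    // Sketch only: applying exp_ps_sve over a float buffer with a predicated loop.
    #include <arm_sve.h>

    static void vec_exp_f32_sve(int n, float * y, const float * x) {
        for (int i = 0; i < n; i += (int) svcntw()) {
            svbool_t pg = svwhilelt_b32(i, n);
            svst1_f32(pg, y + i, exp_ps_sve(pg, svld1_f32(pg, x + i)));
        }
    }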
@@ -733,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
 }
 
 inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v = GGML_FP16_TO_FP32(x);
+    const float v = GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }
 
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
@@ -750,6 +905,60 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }
 
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
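Note: the new ggml_vec_reglu_*, ggml_vec_geglu_* and ggml_vec_swiglu_* helpers compute gated linear units: an activation of x (ReLU, GELU or SiLU respectively) multiplied elementwise by the gate g, matching the GLU kernels this release adds to the other backends. A scalar reference for the SwiGLU case (illustration only, mirroring ggml_vec_swiglu_f16 above in f32) is:

    // Sketch only: scalar SwiGLU reference, y[i] = SiLU(x[i]) * g[i].
    #include <math.h>

    static void swiglu_f32_ref(int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            const float s = x[i] / (1.0f + expf(-x[i]));   // SiLU(x) = x * sigmoid(x)
            y[i] = s * g[i];
        }
    }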
@@ -773,7 +982,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
 inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum += GGML_FP16_TO_FP32(x[i]);
+        sum += GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }