whisper.rn 0.4.0-rc.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. package/README.md +5 -1
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +44 -13
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -38
  7. package/android/src/main/jni.cpp +38 -1
  8. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  15. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  16. package/cpp/coreml/whisper-compat.h +10 -0
  17. package/cpp/coreml/whisper-compat.m +35 -0
  18. package/cpp/coreml/whisper-decoder-impl.h +27 -15
  19. package/cpp/coreml/whisper-decoder-impl.m +36 -10
  20. package/cpp/coreml/whisper-encoder-impl.h +21 -9
  21. package/cpp/coreml/whisper-encoder-impl.m +29 -3
  22. package/cpp/ggml-alloc.c +727 -517
  23. package/cpp/ggml-alloc.h +47 -65
  24. package/cpp/ggml-backend-impl.h +196 -57
  25. package/cpp/ggml-backend-reg.cpp +591 -0
  26. package/cpp/ggml-backend.cpp +2016 -0
  27. package/cpp/ggml-backend.h +234 -89
  28. package/cpp/ggml-common.h +1861 -0
  29. package/cpp/ggml-cpp.h +39 -0
  30. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  31. package/cpp/ggml-cpu/amx/amx.h +8 -0
  32. package/cpp/ggml-cpu/amx/common.h +91 -0
  33. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  34. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  35. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  36. package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
  37. package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
  38. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  39. package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
  41. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  42. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  43. package/cpp/ggml-cpu/binary-ops.h +16 -0
  44. package/cpp/ggml-cpu/common.h +72 -0
  45. package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
  46. package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
  47. package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
  48. package/cpp/ggml-cpu/ops.cpp +9085 -0
  49. package/cpp/ggml-cpu/ops.h +111 -0
  50. package/cpp/ggml-cpu/quants.c +1157 -0
  51. package/cpp/ggml-cpu/quants.h +89 -0
  52. package/cpp/ggml-cpu/repack.cpp +1570 -0
  53. package/cpp/ggml-cpu/repack.h +98 -0
  54. package/cpp/ggml-cpu/simd-mappings.h +1006 -0
  55. package/cpp/ggml-cpu/traits.cpp +36 -0
  56. package/cpp/ggml-cpu/traits.h +38 -0
  57. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  58. package/cpp/ggml-cpu/unary-ops.h +28 -0
  59. package/cpp/ggml-cpu/vec.cpp +321 -0
  60. package/cpp/ggml-cpu/vec.h +973 -0
  61. package/cpp/ggml-cpu.h +143 -0
  62. package/cpp/ggml-impl.h +525 -168
  63. package/cpp/ggml-metal-impl.h +622 -0
  64. package/cpp/ggml-metal.h +16 -14
  65. package/cpp/ggml-metal.m +5289 -1859
  66. package/cpp/ggml-opt.cpp +1037 -0
  67. package/cpp/ggml-opt.h +237 -0
  68. package/cpp/ggml-quants.c +2916 -6877
  69. package/cpp/ggml-quants.h +87 -249
  70. package/cpp/ggml-threading.cpp +12 -0
  71. package/cpp/ggml-threading.h +14 -0
  72. package/cpp/ggml-whisper-sim.metallib +0 -0
  73. package/cpp/ggml-whisper.metallib +0 -0
  74. package/cpp/ggml.c +3293 -16770
  75. package/cpp/ggml.h +778 -835
  76. package/cpp/gguf.cpp +1347 -0
  77. package/cpp/gguf.h +202 -0
  78. package/cpp/rn-whisper.cpp +84 -0
  79. package/cpp/rn-whisper.h +2 -0
  80. package/cpp/whisper-arch.h +197 -0
  81. package/cpp/whisper.cpp +3240 -944
  82. package/cpp/whisper.h +144 -31
  83. package/ios/CMakeLists.txt +95 -0
  84. package/ios/RNWhisper.h +5 -0
  85. package/ios/RNWhisper.mm +124 -37
  86. package/ios/RNWhisperAudioUtils.h +1 -0
  87. package/ios/RNWhisperAudioUtils.m +24 -13
  88. package/ios/RNWhisperContext.h +8 -2
  89. package/ios/RNWhisperContext.mm +42 -8
  90. package/ios/rnwhisper.xcframework/Info.plist +74 -0
  91. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  92. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  93. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  94. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  95. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  96. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  97. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  98. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  99. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  100. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  101. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  102. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  103. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  104. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  105. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  106. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  107. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  108. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  109. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  110. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  111. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  112. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  113. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  114. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  115. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  116. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  117. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  118. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  119. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  120. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  121. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  122. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  123. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  124. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  125. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  126. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  127. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  128. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  129. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  130. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  131. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  132. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  133. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  134. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  135. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  136. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  137. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  138. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  139. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  140. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  141. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  142. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  143. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  144. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  145. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  146. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  147. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  148. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  149. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  150. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  151. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  152. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  153. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  154. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  155. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  156. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  157. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  158. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  159. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  160. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  161. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  162. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  163. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  164. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  165. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  166. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  167. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  168. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  169. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  170. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  171. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  172. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  173. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  174. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  175. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  176. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  177. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  178. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  179. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  180. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  181. package/jest/mock.js +14 -1
  182. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  183. package/lib/commonjs/index.js +48 -19
  184. package/lib/commonjs/index.js.map +1 -1
  185. package/lib/commonjs/version.json +1 -1
  186. package/lib/module/NativeRNWhisper.js.map +1 -1
  187. package/lib/module/index.js +48 -19
  188. package/lib/module/index.js.map +1 -1
  189. package/lib/module/version.json +1 -1
  190. package/lib/typescript/NativeRNWhisper.d.ts +6 -3
  191. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  192. package/lib/typescript/index.d.ts +25 -3
  193. package/lib/typescript/index.d.ts.map +1 -1
  194. package/package.json +15 -10
  195. package/src/NativeRNWhisper.ts +12 -3
  196. package/src/index.ts +63 -24
  197. package/src/version.json +1 -1
  198. package/whisper-rn.podspec +18 -18
  199. package/cpp/README.md +0 -4
  200. package/cpp/ggml-backend.c +0 -1718
  201. package/cpp/ggml-metal-whisper.metal +0 -5820
@@ -0,0 +1,973 @@
1
+ // Vectorized functions for fundamental operations
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-impl.h"
6
+ #include "simd-mappings.h"
7
+ #include "ggml.h"
8
+ #include "ggml-cpu.h"
9
+
10
+ #if defined(WSP_GGML_USE_ACCELERATE)
11
+ #include <Accelerate/Accelerate.h>
12
+ #endif
13
+
14
+ // floating point type used to accumulate sums
15
+ typedef double wsp_ggml_float;
16
+
17
+ #define WSP_GGML_GELU_FP16
18
+ #define WSP_GGML_GELU_QUICK_FP16
19
+
20
+ #define WSP_GGML_SOFT_MAX_UNROLL 4
21
+ #define WSP_GGML_VEC_DOT_UNROLL 2
22
+ #define WSP_GGML_VEC_MAD_UNROLL 32
23
+
24
+ #ifdef __cplusplus
25
+ extern "C" {
26
+ #endif
27
+
28
+ //
29
+ // global data
30
+ //
31
+
32
+ // precomputed gelu table for f16 (128 KB)
33
+ extern wsp_ggml_fp16_t wsp_ggml_table_gelu_f16[1 << 16];
34
+
35
+ // precomputed quick gelu table for f16 (128 KB)
36
+ extern wsp_ggml_fp16_t wsp_ggml_table_gelu_quick_f16[1 << 16];
37
+
38
+ //
39
+ // fundamental operations
40
+ //
41
+
42
+ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const float * WSP_GGML_RESTRICT x, size_t bx, const float * WSP_GGML_RESTRICT y, size_t by, int nrc);
43
+ void wsp_ggml_vec_dot_bf16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_bf16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_bf16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);
44
+ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);
45
+
46
+ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x);
47
+ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
48
+ wsp_ggml_float wsp_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
49
+
50
+ inline static void wsp_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
51
+ inline static void wsp_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
52
+
53
+ inline static void wsp_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
54
+ inline static void wsp_ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
55
+
56
+ inline static void wsp_ggml_vec_set_f16(const int n, wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
+ inline static void wsp_ggml_vec_set_bf16(const int n, wsp_ggml_bf16_t * x, const wsp_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
58
+ inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
59
+ inline static void wsp_ggml_vec_add_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
60
+ for (int i = 0; i < n; ++i) {
61
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) + WSP_GGML_FP16_TO_FP32(y[i]));
62
+ }
63
+ }
64
+ inline static void wsp_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
65
+ inline static void wsp_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
66
+ inline static void wsp_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
67
+ inline static void wsp_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
68
+ inline static void wsp_ggml_vec_sub_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
69
+ for (int i = 0; i < n; ++i) {
70
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) - WSP_GGML_FP16_TO_FP32(y[i]));
71
+ }
72
+ }
73
+ inline static void wsp_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
74
+ inline static void wsp_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
75
+ inline static void wsp_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
76
+ inline static void wsp_ggml_vec_neg_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
77
+ for (int i = 0; i < n; ++i) {
78
+ y[i] = WSP_GGML_FP32_TO_FP16(-WSP_GGML_FP16_TO_FP32(x[i]));
79
+ }
80
+ }
81
+
82
+ inline static void wsp_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
83
+ inline static void wsp_ggml_vec_mul_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
84
+ for (int i = 0; i < n; ++i) {
85
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) * WSP_GGML_FP16_TO_FP32(y[i]));
86
+ }
87
+ }
88
+ inline static void wsp_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
89
+ inline static void wsp_ggml_vec_div_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
90
+ for (int i = 0; i < n; ++i) {
91
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) / WSP_GGML_FP16_TO_FP32(y[i]));
92
+ }
93
+ }
94
+
95
+ // compute WSP_GGML_VEC_DOT_UNROLL dot products at once
96
+ // xs - x row stride in bytes
97
+ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float * WSP_GGML_RESTRICT s, void * WSP_GGML_RESTRICT xv, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y) {
98
+ wsp_ggml_float sumf[WSP_GGML_VEC_DOT_UNROLL] = { 0.0 };
99
+
100
+ wsp_ggml_fp16_t * WSP_GGML_RESTRICT x[WSP_GGML_VEC_DOT_UNROLL];
101
+
102
+ for (int i = 0; i < WSP_GGML_VEC_DOT_UNROLL; ++i) {
103
+ x[i] = (wsp_ggml_fp16_t *) ((char *) xv + i*xs);
104
+ }
105
+
106
+ #if defined(WSP_GGML_SIMD)
107
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
108
+
109
+ WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
110
+
111
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
112
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
113
+
114
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
115
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
116
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
117
+
118
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
119
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
120
+
121
+ sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
122
+ }
123
+ }
124
+ }
125
+
126
+ // reduce sum0..sum3 to sum0
127
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
128
+ WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
129
+ }
130
+
131
+ // leftovers
132
+ for (int i = np; i < n; ++i) {
133
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
134
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[j][i])*WSP_GGML_FP16_TO_FP32(y[i]));
135
+ }
136
+ }
137
+ #else
138
+ for (int i = 0; i < n; ++i) {
139
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
140
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[j][i])*WSP_GGML_FP16_TO_FP32(y[i]));
141
+ }
142
+ }
143
+ #endif
144
+
145
+ for (int i = 0; i < WSP_GGML_VEC_DOT_UNROLL; ++i) {
146
+ s[i] = (float)sumf[i];
147
+ }
148
+ }
149
+
150
+ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y, const float * WSP_GGML_RESTRICT x, const float v) {
151
+ #if defined(WSP_GGML_SIMD)
152
+ #if defined(__ARM_FEATURE_SVE)
153
+
154
+ const int sve_register_length = wsp_ggml_cpu_get_sve_cnt() * 8;
155
+ const int wsp_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
156
+ const int wsp_ggml_f32_step = 8 * wsp_ggml_f32_epr; // choose 8 SVE registers
157
+ WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v);
158
+
159
+ const int np = (n & ~(wsp_ggml_f32_step - 1));
160
+ svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
161
+ svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
162
+ for (int i = 0; i < np; i += wsp_ggml_f32_step) {
163
+
164
+ ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
165
+ ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
166
+ ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
167
+
168
+ WSP_GGML_F32_VEC_STORE(y + i, ay1);
169
+
170
+ ax2 = WSP_GGML_F32_VEC_LOAD(x + i + 1*wsp_ggml_f32_epr);
171
+ ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
172
+ ay2 = WSP_GGML_F32_VEC_FMA(ax2, vx, ay2);
173
+
174
+ WSP_GGML_F32_VEC_STORE(y + i + 1*wsp_ggml_f32_epr, ay2);
175
+
176
+ ax3 = WSP_GGML_F32_VEC_LOAD(x + i + 2*wsp_ggml_f32_epr);
177
+ ay3 = WSP_GGML_F32_VEC_LOAD(y + i + 2*wsp_ggml_f32_epr);
178
+ ay3 = WSP_GGML_F32_VEC_FMA(ax3, vx, ay3);
179
+
180
+ WSP_GGML_F32_VEC_STORE(y + i + 2*wsp_ggml_f32_epr, ay3);
181
+
182
+ ax4 = WSP_GGML_F32_VEC_LOAD(x + i + 3*wsp_ggml_f32_epr);
183
+ ay4 = WSP_GGML_F32_VEC_LOAD(y + i + 3*wsp_ggml_f32_epr);
184
+ ay4 = WSP_GGML_F32_VEC_FMA(ax4, vx, ay4);
185
+
186
+ WSP_GGML_F32_VEC_STORE(y + i + 3*wsp_ggml_f32_epr, ay4);
187
+
188
+ ax5 = WSP_GGML_F32_VEC_LOAD(x + i + 4*wsp_ggml_f32_epr);
189
+ ay5 = WSP_GGML_F32_VEC_LOAD(y + i + 4*wsp_ggml_f32_epr);
190
+ ay5 = WSP_GGML_F32_VEC_FMA(ax5, vx, ay5);
191
+
192
+ WSP_GGML_F32_VEC_STORE(y + i + 4*wsp_ggml_f32_epr, ay5);
193
+
194
+ ax6 = WSP_GGML_F32_VEC_LOAD(x + i + 5*wsp_ggml_f32_epr);
195
+ ay6 = WSP_GGML_F32_VEC_LOAD(y + i + 5*wsp_ggml_f32_epr);
196
+ ay6 = WSP_GGML_F32_VEC_FMA(ax6, vx, ay6);
197
+
198
+ WSP_GGML_F32_VEC_STORE(y + i + 5*wsp_ggml_f32_epr, ay6);
199
+
200
+ ax7 = WSP_GGML_F32_VEC_LOAD(x + i + 6*wsp_ggml_f32_epr);
201
+ ay7 = WSP_GGML_F32_VEC_LOAD(y + i + 6*wsp_ggml_f32_epr);
202
+ ay7 = WSP_GGML_F32_VEC_FMA(ax7, vx, ay7);
203
+
204
+ WSP_GGML_F32_VEC_STORE(y + i + 6*wsp_ggml_f32_epr, ay7);
205
+
206
+ ax8 = WSP_GGML_F32_VEC_LOAD(x + i + 7*wsp_ggml_f32_epr);
207
+ ay8 = WSP_GGML_F32_VEC_LOAD(y + i + 7*wsp_ggml_f32_epr);
208
+ ay8 = WSP_GGML_F32_VEC_FMA(ax8, vx, ay8);
209
+
210
+ WSP_GGML_F32_VEC_STORE(y + i + 7*wsp_ggml_f32_epr, ay8);
211
+ }
212
+ // leftovers
213
+ // Since 8 unrolls are done in above loop, leftovers lie in range [0, wsp_ggml_f32_step] which is handled in below loop
214
+ const int np2 = (n & ~(wsp_ggml_f32_epr - 1));
215
+ for (int i = np; i < np2; i += wsp_ggml_f32_epr) {
216
+ ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
217
+ ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
218
+ ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
219
+
220
+ WSP_GGML_F32_VEC_STORE(y + i, ay1);
221
+ }
222
 + // maximum number of leftover elements will be less than wsp_ggml_f32_epr. Apply predicated svmad on available elements only
223
+ if (np2 < n) {
224
+ svbool_t pg =svwhilelt_b32(np2, n);
225
+ ax1 = svld1_f32(pg, x + np2);
226
+ ay1 = svld1_f32(pg, y + np2);
227
+ ay1 = svmad_f32_m(pg, ax1, vx, ay1);
228
+
229
+ svst1_f32(pg, y + np2, ay1);
230
+ }
231
+ #else
232
+ const int np = (n & ~(WSP_GGML_F32_STEP - 1));
233
+
234
+ WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v);
235
+
236
+ WSP_GGML_F32_VEC ax[WSP_GGML_F32_ARR];
237
+ WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR];
238
+
239
+ for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
240
+ for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
241
+ ax[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR);
242
+ ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR);
243
+ ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], ax[j], vx);
244
+
245
+ WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
246
+ }
247
+ }
248
+
249
+ // leftovers
250
+ for (int i = np; i < n; ++i) {
251
+ y[i] += x[i]*v;
252
+ }
253
+ #endif
254
+ #else
255
+ // scalar
256
+ for (int i = 0; i < n; ++i) {
257
+ y[i] += x[i]*v;
258
+ }
259
+ #endif
260
+ }
261
+
262
+ inline static void wsp_ggml_vec_mad_f16(const int n, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, const wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, const float v) {
263
+ #if defined(WSP_GGML_SIMD)
264
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
265
+
266
+ WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
267
+
268
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
269
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
270
+
271
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
272
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
273
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
274
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
275
+ ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
276
+
277
+ WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
278
+ }
279
+ }
280
+
281
+ // leftovers
282
+ for (int i = np; i < n; ++i) {
283
+ y[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(y[i]) + WSP_GGML_FP16_TO_FP32(x[i])*v);
284
+ }
285
+ #else
286
+ // scalar
287
+ for (int i = 0; i < n; ++i) {
288
+ y[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(y[i]) + WSP_GGML_FP16_TO_FP32(x[i])*v);
289
+ }
290
+ #endif
291
+ }
292
+
293
+ // xs and vs are byte strides of x and v
294
+ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * WSP_GGML_RESTRICT y, const float * WSP_GGML_RESTRICT xv, const float * WSP_GGML_RESTRICT vv) {
295
+
296
+ const float * WSP_GGML_RESTRICT x[WSP_GGML_VEC_MAD_UNROLL];
297
+ const float * WSP_GGML_RESTRICT v[WSP_GGML_VEC_MAD_UNROLL];
298
+
299
+ for (int i = 0; i < WSP_GGML_VEC_MAD_UNROLL; ++i) {
300
+ x[i] = (const float *) ((const char *) xv + i*xs);
301
+ v[i] = (const float *) ((const char *) vv + i*vs);
302
+ }
303
+
304
+ #if defined(WSP_GGML_SIMD)
305
+ #if defined(__ARM_FEATURE_SVE)
306
+ // scalar Route to scalar implementation //TODO: Write SVE code
307
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; ++k) {
308
+ for (int i = 0; i < n; ++i) {
309
+ y[i] += x[k][i]*v[k][0];
310
+ }
311
+ }
312
+ #else
313
+ const int np = (n & ~(WSP_GGML_F32_STEP - 1));
314
+
315
+ WSP_GGML_F32_VEC vx[WSP_GGML_VEC_MAD_UNROLL];
316
+
317
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; ++k) {
318
+ vx[k] = WSP_GGML_F32_VEC_SET1(v[k][0]);
319
+ }
320
+
321
+ WSP_GGML_F32_VEC ax[WSP_GGML_VEC_MAD_UNROLL][WSP_GGML_F32_ARR];
322
+ WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR];
323
+
324
+ for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
325
+ for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
326
+ ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR);
327
+
328
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; ++k) {
329
+ ax[k][j] = WSP_GGML_F32_VEC_LOAD(x[k] + i + j*WSP_GGML_F32_EPR);
330
+ ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
331
+ }
332
+
333
+ WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
334
+ }
335
+ }
336
+
337
+ // leftovers
338
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; ++k) {
339
+ for (int i = np; i < n; ++i) {
340
+ y[i] += x[k][i]*v[k][0];
341
+ }
342
+ }
343
+ #endif
344
+ #else
345
+ // scalar
346
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; ++k) {
347
+ for (int i = 0; i < n; ++i) {
348
+ y[i] += x[k][i]*v[k][0];
349
+ }
350
+ }
351
+ #endif
352
+ }
353
+
354
+ //inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
+ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) {
356
+ #if defined(WSP_GGML_USE_ACCELERATE)
357
+ vDSP_vsmul(y, 1, &v, y, 1, n);
358
+ #elif defined(WSP_GGML_SIMD)
359
+ #if defined(__ARM_FEATURE_SVE)
360
+ const int sve_register_length = wsp_ggml_cpu_get_sve_cnt() * 8;
361
+ const int wsp_ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
362
+ const int wsp_ggml_f32_step = 2 * wsp_ggml_f32_epr;
363
+
364
+ WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v);
365
+ const int np = (n & ~(wsp_ggml_f32_step - 1));
366
+ svfloat32_t ay1;
367
+ svfloat32_t ay2;
368
+ for (int i = 0; i < np; i += wsp_ggml_f32_step) {
369
+ ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
370
+ ay1 = WSP_GGML_F32_VEC_MUL(ay1, vx);
371
+ WSP_GGML_F32_VEC_STORE(y + i, ay1);
372
+
373
+ ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
374
+ ay2 = WSP_GGML_F32_VEC_MUL(ay2, vx);
375
+ WSP_GGML_F32_VEC_STORE(y + i + 1*wsp_ggml_f32_epr, ay2);
376
+ }
377
+ // leftovers
378
 + // maximum number of leftover elements will be less than wsp_ggml_f32_epr. Apply predicated svmad on available elements only
379
+ if (np < n) {
380
+ svbool_t pg = svwhilelt_b32(np, n);
381
+ ay1 = svld1_f32(pg, y + np);
382
+ ay1 = svmul_f32_m(pg, ay1, vx);
383
+ svst1_f32(pg, y + np, ay1);
384
+ }
385
+ #else
386
+ const int np = (n & ~(WSP_GGML_F32_STEP - 1));
387
+
388
+ WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v);
389
+
390
+ WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR];
391
+
392
+ for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
393
+ for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
394
+ ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR);
395
+ ay[j] = WSP_GGML_F32_VEC_MUL(ay[j], vx);
396
+
397
+ WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
398
+ }
399
+ }
400
+
401
+ // leftovers
402
+ for (int i = np; i < n; ++i) {
403
+ y[i] *= v;
404
+ }
405
+ #endif
406
+ #else
407
+ // scalar
408
+ for (int i = 0; i < n; ++i) {
409
+ y[i] *= v;
410
+ }
411
+ #endif
412
+ }
413
+
414
// In-place scale of an fp16 array by an fp32 scalar. The SIMD path uses the
// F16 vector helpers; the tail/scalar path converts each element fp16->fp32,
// multiplies, and rounds back to fp16.
inline static void wsp_ggml_vec_scale_f16(const int n, wsp_ggml_fp16_t * y, const float v) {
#if defined(WSP_GGML_SIMD)
    const int np = (n & ~(WSP_GGML_F16_STEP - 1)); // bulk, multiple of the SIMD step

    WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);

    WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];

    for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
        for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
            ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
            ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);

            WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(y[i])*v);
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(y[i])*v);
    }
#endif
}
442
+
443
// Element-wise unary kernels. Each _f16 variant widens the element to fp32,
// applies the fp32 math, and rounds back to fp16.

// L2 norm: sqrt(x . x), computed via the dot-product kernel (strides 0, nrc 1).
inline static void wsp_ggml_vec_norm_f32 (const int n, float * s, const float * x) { wsp_ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void wsp_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void wsp_ggml_vec_sqr_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16(v*v);
    }
}
inline static void wsp_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void wsp_ggml_vec_sqrt_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(sqrtf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void wsp_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void wsp_ggml_vec_log_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(logf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void wsp_ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void wsp_ggml_vec_sin_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(sinf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void wsp_ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void wsp_ggml_vec_cos_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(cosf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void wsp_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void wsp_ggml_vec_abs_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(fabsf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
// sign(x): -1 / 0 / +1 (0 also for NaN, since both comparisons fail)
inline static void wsp_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void wsp_ggml_vec_sgn_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}
// Heaviside step (0 at x == 0)
inline static void wsp_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void wsp_ggml_vec_step_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16((WSP_GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}
inline static void wsp_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void wsp_ggml_vec_tanh_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(tanhf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
// ELU: identity for x > 0, exp(x)-1 otherwise
inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
501
+ inline static void wsp_ggml_vec_elu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
502
+ for (int i = 0; i < n; ++i) {
503
+ y[i] = WSP_GGML_FP32_TO_FP16(expm1f(WSP_GGML_FP16_TO_FP32(x[i])));
504
+ }
505
+ }
506
// ReLU / sigmoid-family element-wise kernels; fp16 variants round-trip
// through fp32 per element.
inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void wsp_ggml_vec_relu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}
// leaky ReLU with negative slope ns: x for x > 0, ns*x for x < 0
inline static void wsp_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void wsp_ggml_vec_leaky_relu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}
inline static void wsp_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void wsp_ggml_vec_sigmoid_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(1.f / (1.f + expf(-WSP_GGML_FP16_TO_FP32(x[i]))));
    }
}
// TODO: optimize performance
// hardswish: x * clamp((x + 3) / 6, 0, 1)
inline static void wsp_ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void wsp_ggml_vec_hardswish_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}
// hardsigmoid: clamp((x + 3) / 6, 0, 1)
inline static void wsp_ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void wsp_ggml_vec_hardsigmoid_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (WSP_GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}
inline static void wsp_ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void wsp_ggml_vec_exp_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = WSP_GGML_FP32_TO_FP16(expf(WSP_GGML_FP16_TO_FP32(x[i])));
    }
}
546
+
547
+ static const float GELU_COEF_A = 0.044715f;
548
+ static const float GELU_QUICK_COEF = -1.702f;
549
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
550
+ static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
551
+
552
+ inline static float wsp_ggml_gelu_f32(float x) {
553
+ return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
554
+ }
555
+
556
// GELU over fp16 via a precomputed 64K-entry table indexed by the raw fp16
// bit pattern of each input element.
inline static void wsp_ggml_vec_gelu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x; // reinterpret fp16 as its bits
    for (int i = 0; i < n; ++i) {
        y[i] = wsp_ggml_table_gelu_f16[i16[i]];
    }
}

// "erf" GELU on fp16: exact definition 0.5*x*(1 + erf(x/sqrt(2))), in fp32.
inline static void wsp_ggml_vec_gelu_erf_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = WSP_GGML_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = WSP_GGML_FP32_TO_FP16(res);
    }
}
570
+
571
#ifdef WSP_GGML_GELU_FP16
// fp32 GELU routed through the fp16 lookup table; saturates outside
// [-10, 10] where GELU is ~0 / ~identity, avoiding fp16 range issues.
inline static void wsp_ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t)); // type-pun via memcpy (no aliasing UB)
            y[i] = WSP_GGML_FP16_TO_FP32(wsp_ggml_table_gelu_f16[t]);
        }
    }
}
#else
// exact per-element evaluation of the tanh-approximation GELU
inline static void wsp_ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = wsp_ggml_gelu_f32(x[i]);
    }
}
#endif

// "erf" GELU in fp32: 0.5*x*(1 + erf(x/sqrt(2)))
inline static void wsp_ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}
600
+
601
// "quick" GELU: x * sigmoid(1.702 * x)
inline static float wsp_ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void wsp_ggml_vec_gelu_quick_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
//    const uint16_t * i16 = (const uint16_t *) x;
//    for (int i = 0; i < n; ++i) {
//        y[i] = wsp_ggml_table_gelu_quick_f16[i16[i]];
//    }
//}

#ifdef WSP_GGML_GELU_QUICK_FP16
// fp32 quick-GELU via the fp16 lookup table (one round-trip per element)
inline static void wsp_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t)); // type-pun via memcpy (no aliasing UB)
        y[i] = WSP_GGML_FP16_TO_FP32(wsp_ggml_table_gelu_quick_f16[t]);
    }
}
#else
// exact per-element evaluation
inline static void wsp_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = wsp_ggml_gelu_quick_f32(x[i]);
    }
}
#endif

// quick-GELU on fp16, computed in fp32
inline static void wsp_ggml_vec_gelu_quick_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = WSP_GGML_FP16_TO_FP32(x[i]);
        y[i] = WSP_GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}
635
+
636
+ // Sigmoid Linear Unit (SiLU) function
637
+ inline static float wsp_ggml_silu_f32(float x) {
638
+ return x/(1.0f + expf(-x));
639
+ }
640
// SiLU on a single fp16 value, computed in fp32 and rounded back.
inline static wsp_ggml_fp16_t wsp_ggml_silu_f16(wsp_ggml_fp16_t x) {
    float v = WSP_GGML_FP16_TO_FP32(x);
    return WSP_GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
}
644
+
645
+ #if __FINITE_MATH_ONLY__
646
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
647
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
648
+ #endif
649
+
650
/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
// Vectorized expf for SVE, predicated on pg. Splits x*log2(e) into integer
// and fractional parts, uses svexpa (FEXPA table lookup) for 2^frac, and a
// short polynomial in ln2 for the residual.
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
    const svfloat32_t one = svdup_n_f32(1.0f);
    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
    const svint32_t inactive2 = svdup_n_s32(0);

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float)
    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n

    t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one); // b = a + 1

    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5); // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4); // Final result

    return t0;
}
#endif
685
+
686
#if defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// Vectorized expf: round-to-nearest via the 0x1.8p23 shifter trick, degree-4
// polynomial on the reduced argument, exponent reassembly in the integer
// domain, with a slow path (the branch) only when any |n| > 126.
inline static float32x4_t wsp_ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); // z = round(x*log2e) + shifter
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f)); // Cody-Waite reduction
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); // lanes needing over/underflow handling
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k); // fast path: no extreme exponents
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t wsp_ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = wsp_ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}
724
+
725
#elif defined(__AVX512F__) && defined(__AVX512DQ__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// AVX-512 variant: _mm512_scalef_ps handles the 2^n scaling directly, so only
// saturation (|n| > 192) needs the mask-blend slow path.
inline static __m512 wsp_ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); // round(x*log2e) + shifter
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); // Cody-Waite reduction
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n); // j * 2^n
    if (_mm512_kortestz(d, d))
        return res; // fast path: no saturating lanes
    const __m512 zero = _mm512_setzero_ps();
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 wsp_ggml_v_silu(__m512 x) {
    const __m512 one = _mm512_set1_ps(1);
    const __m512 zero = _mm512_setzero_ps();
    const __m512 neg_x = _mm512_sub_ps(zero, x);
    const __m512 exp_neg_x = wsp_ggml_v_expf(neg_x);
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
    return _mm512_div_ps(x, one_plus_exp_neg_x);
}
767
+
768
#elif defined(__AVX2__) && defined(__FMA__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// AVX2 variant: same shifter/polynomial scheme as the NEON path; over/underflow
// lanes (|n| > 126 / > 192) are fixed up with and/andnot blends.
inline static __m256 wsp_ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); // round(x*log2e) + shifter
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); // Cody-Waite reduction
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), // |n| via sign-bit clear
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k); // fast path: no extreme exponents
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 wsp_ggml_v_silu(__m256 x) {
    const __m256 one = _mm256_set1_ps(1);
    const __m256 zero = _mm256_setzero_ps();
    const __m256 neg_x = _mm256_sub_ps(zero, x);
    const __m256 exp_neg_x = wsp_ggml_v_expf(neg_x);
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
    return _mm256_div_ps(x, one_plus_exp_neg_x);
}
822
+
823
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON

// FMA-or-fallback helpers so the same expression tree works on plain SSE2
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// SSE2 variant of the same shifter/polynomial expf scheme.
inline static __m128 wsp_ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); // round(x*log2e) + shifter
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); // Cody-Waite reduction
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); // |n| > 126
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k); // fast path: no extreme exponents
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 wsp_ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = wsp_ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__
878
+
879
// SiLU over an fp16 array (element-wise wsp_ggml_silu_f16; no SIMD path here).
inline static void wsp_ggml_vec_silu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = wsp_ggml_silu_f16(x[i]);
    }
}
884
+
885
+ inline static float wsp_ggml_silu_backward_f32(float x, float dy) {
886
+ const float s = 1.0f/(1.0f + expf(-x));
887
+ return dy*s*(1.0f + x*(1.0f - s));
888
+ }
889
+
890
// fp16 SiLU backward: same formula as the f32 variant, computed in fp32.
inline static wsp_ggml_fp16_t wsp_ggml_silu_backward_f16(wsp_ggml_fp16_t x, wsp_ggml_fp16_t dy) {
    const float v = WSP_GGML_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}
895
+
896
// Vector SiLU backward: dx[i] = silu'(x[i]) * dy[i].
inline static void wsp_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = wsp_ggml_silu_backward_f32(x[i], dy[i]);
    }
}

// fp16 counterpart (per-element fp32 round-trip inside the scalar helper)
inline static void wsp_ggml_vec_silu_backward_f16(const int n, wsp_ggml_fp16_t * dx, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = wsp_ggml_silu_backward_f16(x[i], dy[i]);
    }
}
907
+
908
// Sum of n floats. The scalar path accumulates in wsp_ggml_float (a wider
// accumulator type declared elsewhere) to reduce rounding error; the
// Accelerate path uses vDSP_sve.
inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef WSP_GGML_USE_ACCELERATE
    wsp_ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (wsp_ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

// Same sum, but returns the wide accumulator without narrowing to float.
inline static void wsp_ggml_vec_sum_f32_ggf(const int n, wsp_ggml_float * s, const float * x) {
    wsp_ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (wsp_ggml_float)x[i];
    }
    *s = sum;
}
927
+
928
// Sum of an fp16 array, accumulated in float (note: float, not the wide
// accumulator used by the f32 variant).
inline static void wsp_ggml_vec_sum_f16_ggf(const int n, float * s, const wsp_ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += WSP_GGML_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

// Sum of a bf16 array, accumulated in float.
inline static void wsp_ggml_vec_sum_bf16_ggf(const int n, float * s, const wsp_ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += WSP_GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}
943
+
944
// Writes the maximum of x[0..n) to *s; -INFINITY when n == 0.
// Accelerate builds delegate to vDSP_maxv.
inline static void wsp_ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef WSP_GGML_USE_ACCELERATE
    float best = -INFINITY;
    const float * p   = x;
    const float * end = x + n;
    while (p < end) {
        best = MAX(best, *p);
        ++p;
    }
    *s = best;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}
955
+
956
// Reciprocal of the L2 norm. Note: a zero-norm input divides by zero
// (yields inf under IEEE arithmetic); no guard here.
inline static void wsp_ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    wsp_ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}
960
+
961
// Index of the largest element of x[0..n); ties resolve to the LAST
// occurrence (matching the original MAX-then-compare formulation for
// non-NaN input). Writes 0 when n == 0.
inline static void wsp_ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float best = -INFINITY;
    int best_idx = 0;
    for (int i = 0; i < n; ++i) {
        if (x[i] >= best) {
            best = x[i];
            best_idx = i;
        }
    }
    *s = best_idx;
}
970
+
971
+ #ifdef __cplusplus
972
+ }
973
+ #endif