whisper.rn 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/android/src/main/jni.cpp +12 -3
  4. package/cpp/ggml-alloc.c +292 -130
  5. package/cpp/ggml-backend-impl.h +4 -4
  6. package/cpp/ggml-backend-reg.cpp +13 -5
  7. package/cpp/ggml-backend.cpp +207 -17
  8. package/cpp/ggml-backend.h +19 -1
  9. package/cpp/ggml-cpu/amx/amx.cpp +5 -2
  10. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  11. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  12. package/cpp/ggml-cpu/common.h +14 -0
  13. package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
  14. package/cpp/ggml-cpu/ggml-cpu.c +65 -44
  15. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  16. package/cpp/ggml-cpu/ops.cpp +542 -775
  17. package/cpp/ggml-cpu/ops.h +2 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  19. package/cpp/ggml-cpu/unary-ops.cpp +135 -0
  20. package/cpp/ggml-cpu/unary-ops.h +5 -0
  21. package/cpp/ggml-cpu/vec.cpp +227 -20
  22. package/cpp/ggml-cpu/vec.h +407 -56
  23. package/cpp/ggml-cpu.h +1 -1
  24. package/cpp/ggml-impl.h +94 -12
  25. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  26. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  27. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  28. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  29. package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
  30. package/cpp/ggml-metal/ggml-metal-device.h +244 -0
  31. package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
  32. package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
  33. package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
  34. package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
  35. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  36. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  37. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  38. package/cpp/ggml-metal-impl.h +40 -40
  39. package/cpp/ggml-metal.h +1 -6
  40. package/cpp/ggml-quants.c +1 -0
  41. package/cpp/ggml.c +341 -15
  42. package/cpp/ggml.h +150 -5
  43. package/cpp/jsi/RNWhisperJSI.cpp +9 -2
  44. package/cpp/jsi/ThreadPool.h +3 -3
  45. package/cpp/rn-whisper.h +1 -0
  46. package/cpp/whisper.cpp +89 -72
  47. package/cpp/whisper.h +1 -0
  48. package/ios/CMakeLists.txt +6 -1
  49. package/ios/RNWhisperContext.mm +3 -1
  50. package/ios/RNWhisperVadContext.mm +14 -13
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  61. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  69. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  70. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  71. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  72. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  73. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  74. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  80. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  81. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  82. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
  83. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  84. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  85. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  86. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  87. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
  92. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  93. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  94. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
  95. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
  96. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  97. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  98. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  99. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  100. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  101. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  102. package/lib/commonjs/version.json +1 -1
  103. package/lib/module/NativeRNWhisper.js.map +1 -1
  104. package/lib/module/version.json +1 -1
  105. package/lib/typescript/NativeRNWhisper.d.ts +2 -0
  106. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  107. package/package.json +1 -1
  108. package/src/NativeRNWhisper.ts +2 -0
  109. package/src/version.json +1 -1
  110. package/whisper-rn.podspec +8 -9
  111. package/cpp/ggml-metal.m +0 -6779
  112. package/cpp/ggml-whisper-sim.metallib +0 -0
  113. package/cpp/ggml-whisper.metallib +0 -0
@@ -44,6 +44,7 @@ void wsp_ggml_vec_dot_bf16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_gg
44
44
  void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);
45
45
 
46
46
  void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x);
47
+ wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
47
48
  wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
48
49
  wsp_ggml_float wsp_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
49
50
 
@@ -119,36 +120,149 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
119
120
  }
120
121
 
121
122
  #if defined(WSP_GGML_SIMD)
122
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
123
+ #if defined(__ARM_FEATURE_SVE)
124
+
125
+ const int sve_register_length = svcntb() * 8;
126
+ const int wsp_ggml_f16_epr = sve_register_length / 16; // running when 16
127
+ const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
128
+
129
+ const int np = (n & ~(wsp_ggml_f16_step - 1));
130
+
131
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
134
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
135
+
136
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
139
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
140
+
141
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
142
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
143
+
144
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
145
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // 8 elements
146
+
147
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
148
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
149
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
150
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
151
+
152
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next 8 elements
153
+
154
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 elements
155
+ sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
156
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
157
+ sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
158
+
159
+ ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
160
+
161
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
162
+ sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
163
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
164
+ sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
165
+
166
+ ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
167
+
168
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 3*wsp_ggml_f16_epr, 3);
169
+ sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
170
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 3*wsp_ggml_f16_epr, 3);
171
+ sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
172
+
173
+ ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
174
+
175
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 4*wsp_ggml_f16_epr, 4);
176
+
177
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
178
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 4*wsp_ggml_f16_epr, 4);
179
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
180
+
181
+ ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
182
+
183
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 5*wsp_ggml_f16_epr, 5);
184
+
185
+ sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
186
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 5*wsp_ggml_f16_epr, 5);
187
+ sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
188
+
189
+ ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
190
+
191
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 6*wsp_ggml_f16_epr, 6);
192
+
193
+ sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
194
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 6*wsp_ggml_f16_epr, 6);
195
+ sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
196
+
197
+ ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
198
+
199
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 7*wsp_ggml_f16_epr, 7);
200
+
201
+ sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
202
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 7*wsp_ggml_f16_epr, 7);
203
+ sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
204
+ }
205
+
206
+ const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
207
+ for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
208
+ svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
209
+
210
+ svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x[0] + k, 0);
211
+ sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, rx, ry);
212
+ rx = WSP_GGML_F16x_VEC_LOAD(x[1] + k, 0);
213
+ sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, rx, ry);
214
+ }
215
+
216
+ if (np2 < n) {
217
+ svbool_t pg = svwhilelt_b16(np2, n);
218
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
219
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
220
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
221
+
222
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
223
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
224
+ }
225
+ WSP_GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
226
+ WSP_GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
227
+ #elif defined(__riscv_v_intrinsic)
228
+ // todo: RVV impl
229
+ for (int i = 0; i < n; ++i) {
230
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
231
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
232
+ }
233
+ }
234
+ #else
235
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
123
236
 
124
- WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
237
+ WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
125
238
 
126
- WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
127
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
239
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
240
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
128
241
 
129
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
130
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
131
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
242
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
243
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
244
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
132
245
 
133
- for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
134
- ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
246
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
247
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
135
248
 
136
- sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
249
+ sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
250
+ }
137
251
  }
138
252
  }
139
- }
140
253
 
141
- // reduce sum0..sum3 to sum0
142
- for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
143
- WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
144
- }
254
+ // reduce sum0..sum3 to sum0
255
+ for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
256
+ WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
257
+ }
145
258
 
146
- // leftovers
147
- for (int i = np; i < n; ++i) {
148
- for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
149
- sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
259
+ // leftovers
260
+ for (int i = np; i < n; ++i) {
261
+ for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
262
+ sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
263
+ }
150
264
  }
151
- }
265
+ #endif
152
266
  #else
153
267
  for (int i = 0; i < n; ++i) {
154
268
  for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +357,14 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
243
357
 
244
358
  svst1_f32(pg, y + np2, ay1);
245
359
  }
360
+ #elif defined(__riscv_v_intrinsic)
361
+ for (int i = 0, avl; i < n; i += avl) {
362
+ avl = __riscv_vsetvl_e32m8(n - i);
363
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
364
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
365
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
366
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
367
+ }
246
368
  #else
247
369
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
248
370
 
@@ -276,27 +398,112 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
276
398
 
277
399
  inline static void wsp_ggml_vec_mad_f16(const int n, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, const wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, const float v) {
278
400
  #if defined(WSP_GGML_SIMD)
279
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
401
+ #if defined(__ARM_FEATURE_SVE)
402
+ const int sve_register_length = svcntb() * 8;
403
+ const int wsp_ggml_f16_epr = sve_register_length / 16;
404
+ const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr;
405
+
406
+ WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
407
+
408
+ const int np= (n & ~(wsp_ggml_f16_step - 1));
409
+
410
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
411
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
412
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
413
+ ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
414
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
415
+ ay1 = WSP_GGML_F16x_VEC_FMA(ay1, ax1, vx);
416
+
417
+ WSP_GGML_F16x_VEC_STORE(y + i + 0 * wsp_ggml_f16_epr, ay1, 0);
418
+
419
+ ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
420
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
421
+ ay2 = WSP_GGML_F16x_VEC_FMA(ay2, ax2, vx);
422
+
423
+ WSP_GGML_F16x_VEC_STORE(y + i + 1 * wsp_ggml_f16_epr, ay2, 1);
424
+
425
+ ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
426
+ ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
427
+ ay3 = WSP_GGML_F16x_VEC_FMA(ay3, ax3, vx);
428
+
429
+ WSP_GGML_F16x_VEC_STORE(y + i + 2 * wsp_ggml_f16_epr, ay3, 2);
430
+
431
+ ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
432
+ ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
433
+ ay4 = WSP_GGML_F16x_VEC_FMA(ay4, ax4, vx);
434
+
435
+ WSP_GGML_F16x_VEC_STORE(y + i + 3 * wsp_ggml_f16_epr, ay4, 3);
436
+
437
+ ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
438
+ ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
439
+ ay5 = WSP_GGML_F16x_VEC_FMA(ay5, ax5, vx);
440
+
441
+ WSP_GGML_F16x_VEC_STORE(y + i + 4 * wsp_ggml_f16_epr, ay5, 4);
442
+
443
+ ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
444
+ ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
445
+ ay6 = WSP_GGML_F16x_VEC_FMA(ay6, ax6, vx);
446
+
447
+ WSP_GGML_F16x_VEC_STORE(y + i + 5 * wsp_ggml_f16_epr, ay6, 5);
280
448
 
281
- WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
449
+ ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
450
+ ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
451
+ ay7 = WSP_GGML_F16x_VEC_FMA(ay7, ax7, vx);
282
452
 
283
- WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
284
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
453
+ WSP_GGML_F16x_VEC_STORE(y + i + 6 * wsp_ggml_f16_epr, ay7, 6);
285
454
 
286
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
287
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
288
- ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
289
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
290
- ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
455
+ ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
456
+ ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
457
+ ay8 = WSP_GGML_F16x_VEC_FMA(ay8, ax8, vx);
291
458
 
292
- WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
459
+ WSP_GGML_F16x_VEC_STORE(y + i + 7 * wsp_ggml_f16_epr, ay8, 7);
293
460
  }
294
- }
461
+ const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
462
+ for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
463
+ svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
464
+ svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
465
+ ry = WSP_GGML_F16x_VEC_FMA(ry, rx, vx);
295
466
 
296
- // leftovers
297
- for (int i = np; i < n; ++i) {
298
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
299
- }
467
+ WSP_GGML_F16x_VEC_STORE(y + k, ry, 0);
468
+ }
469
+
470
+ if (np2 < n) {
471
+ svbool_t pg = svwhilelt_b16(np2, n);
472
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
473
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
474
+ hy = svmad_f16_x(pg, hx, vx, hy);
475
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
476
+ }
477
+
478
+ #elif defined(__riscv_v_intrinsic)
479
+ // todo: RVV impl
480
+ // scalar
481
+ for (int i = 0; i < n; ++i) {
482
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
483
+ }
484
+ #else
485
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
486
+
487
+ WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
488
+
489
+ WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
490
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
491
+
492
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
493
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
494
+ ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
495
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
496
+ ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
497
+
498
+ WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
499
+ }
500
+ }
501
+
502
+ // leftovers
503
+ for (int i = np; i < n; ++i) {
504
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
505
+ }
506
+ #endif
300
507
  #else
301
508
  // scalar
302
509
  for (int i = 0; i < n; ++i) {
@@ -324,6 +531,16 @@ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const
324
531
  y[i] += x[k][i]*v[k][0];
325
532
  }
326
533
  }
534
+ #elif defined(__riscv_v_intrinsic)
535
+ for (int i = 0, avl; i < n; i += avl) {
536
+ avl = __riscv_vsetvl_e32m8(n - i);
537
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
538
+ for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; k++) {
539
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
540
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
541
+ }
542
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
543
+ }
327
544
  #else
328
545
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
329
546
 
@@ -375,6 +592,14 @@ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x
375
592
  for (int i = 0; i < n; ++i) {
376
593
  y[i] = x[i]*s + b;
377
594
  }
595
+ #elif defined(__riscv_v_intrinsic)
596
+ for (int i = 0, avl; i < n; i += avl) {
597
+ avl = __riscv_vsetvl_e32m8(n - i);
598
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
599
+ vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
600
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
601
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
602
+ }
378
603
  #else
379
604
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
380
605
 
@@ -386,7 +611,7 @@ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x
386
611
  for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
387
612
  for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
388
613
  ay[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR);
389
- ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], vs, vb);
614
+ ay[j] = WSP_GGML_F32_VEC_FMA(vb, ay[j], vs);
390
615
 
391
616
  WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
392
617
  }
@@ -430,11 +655,18 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
430
655
  }
431
656
  // leftovers
432
657
  // maximum number of leftover elements will be less that wsp_ggml_f32_epr. Apply predicated svmad on available elements only
433
- if (np < n) {
434
- svbool_t pg = svwhilelt_b32(np, n);
435
- ay1 = svld1_f32(pg, y + np);
658
+ for (int i = np; i < n; i += wsp_ggml_f32_epr) {
659
+ svbool_t pg = svwhilelt_b32(i, n);
660
+ ay1 = svld1_f32(pg, y + i);
436
661
  ay1 = svmul_f32_m(pg, ay1, vx);
437
- svst1_f32(pg, y + np, ay1);
662
+ svst1_f32(pg, y + i, ay1);
663
+ }
664
+ #elif defined(__riscv_v_intrinsic)
665
+ for (int i = 0, avl; i < n; i += avl) {
666
+ avl = __riscv_vsetvl_e32m8(n - i);
667
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
668
+ vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
669
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
438
670
  }
439
671
  #else
440
672
  const int np = (n & ~(WSP_GGML_F32_STEP - 1));
@@ -467,25 +699,59 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
467
699
 
468
700
  inline static void wsp_ggml_vec_scale_f16(const int n, wsp_ggml_fp16_t * y, const float v) {
469
701
  #if defined(WSP_GGML_SIMD)
470
- const int np = (n & ~(WSP_GGML_F16_STEP - 1));
702
+ #if defined(__ARM_FEATURE_SVE)
703
+ const int sve_register_length = svcntb() * 8;
704
+ const int wsp_ggml_f16_epr = sve_register_length / 16;
705
+ const int wsp_ggml_f16_step = 2 * wsp_ggml_f16_epr;
706
+
707
+ WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
708
+ const int np = (n & ~(wsp_ggml_f16_step - 1));
709
+ svfloat16_t ay1, ay2;
710
+
711
+ for (int i = 0; i < np; i += wsp_ggml_f16_step) {
712
+ ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0*wsp_ggml_f16_epr, 0);
713
+ ay1 = WSP_GGML_F16x_VEC_MUL(ay1, vx);
714
+ WSP_GGML_F16x_VEC_STORE(y + i + 0*wsp_ggml_f16_epr, ay1, 0);
715
+
716
+ ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1*wsp_ggml_f16_epr, 1);
717
+ ay2 = WSP_GGML_F16x_VEC_MUL(ay2, vx);
718
+ WSP_GGML_F16x_VEC_STORE(y + i + 1*wsp_ggml_f16_epr, ay2, 1);
719
+ }
720
+ // leftovers
721
+ // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
722
+ if (np < n) {
723
+ svbool_t pg = svwhilelt_b16(np, n);
724
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
725
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
726
+ svst1_f16(pg, (__fp16 *)(y + np), out);
727
+ }
728
+ #elif defined(__riscv_v_intrinsic)
729
+ // todo: RVV impl
730
+ // scalar
731
+ for (int i = 0; i < n; ++i) {
732
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
733
+ }
734
+ #else
735
+ const int np = (n & ~(WSP_GGML_F16_STEP - 1));
471
736
 
472
- WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
737
+ WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
473
738
 
474
- WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
739
+ WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
475
740
 
476
- for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
477
- for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
478
- ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
479
- ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);
741
+ for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
742
+ for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
743
+ ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
744
+ ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);
480
745
 
481
- WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
746
+ WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
747
+ }
482
748
  }
483
- }
484
749
 
485
- // leftovers
486
- for (int i = np; i < n; ++i) {
487
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
488
- }
750
+ // leftovers
751
+ for (int i = np; i < n; ++i) {
752
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
753
+ }
754
+ #endif
489
755
  #else
490
756
  // scalar
491
757
  for (int i = 0; i < n; ++i) {
@@ -554,7 +820,8 @@ inline static void wsp_ggml_vec_tanh_f16 (const int n, wsp_ggml_fp16_t * y, cons
554
820
  inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
555
821
  inline static void wsp_ggml_vec_elu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
556
822
  for (int i = 0; i < n; ++i) {
557
- y[i] = WSP_GGML_CPU_FP32_TO_FP16(expm1f(WSP_GGML_CPU_FP16_TO_FP32(x[i])));
823
+ const float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
824
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
558
825
  }
559
826
  }
560
827
  inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
@@ -737,7 +1004,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
737
1004
  }
738
1005
  #endif
739
1006
 
740
- #if defined(__ARM_NEON) && defined(__aarch64__)
1007
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1008
+
1009
+ inline static svfloat32_t wsp_ggml_v_expf(svbool_t pg, svfloat32_t x) {
1010
+ const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
1011
+ const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
1012
+ const svfloat32_t n = svsub_f32_x(pg, z, r);
1013
+ const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
1014
+ const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
1015
+ const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
1016
+ const svbool_t c = svacgt_n_f32(pg, n, 126);
1017
+ const svfloat32_t u = svmul_f32_x(pg, b, b);
1018
+ const svfloat32_t j = svmla_f32_x(pg,
1019
+ svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
1020
+ svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
1021
+ svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
1022
+ const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
1023
+ const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
1024
+ const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
1025
+ return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
1026
+ svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
1027
+ }
1028
+
1029
+ // computes silu x/(1+exp(-x)) in single precision vector
1030
+ inline static svfloat32_t wsp_ggml_v_silu(svbool_t pg, svfloat32_t x) {
1031
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1032
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1033
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1034
+ const svfloat32_t exp_neg_x = wsp_ggml_v_expf(pg, neg_x);
1035
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1036
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1037
+ }
1038
+
1039
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
741
1040
 
742
1041
  // adapted from arm limited optimized routine
743
1042
  // the maximum error is 1.45358 plus 0.5 ulps
@@ -928,7 +1227,59 @@ inline static __m128 wsp_ggml_v_silu(__m128 x) {
928
1227
  return _mm_div_ps(x, one_plus_exp_neg_x);
929
1228
  }
930
1229
 
931
- #endif // __ARM_NEON / __AVX2__ / __SSE2__
1230
+ #elif defined(__riscv_v_intrinsic)
1231
+
1232
+ // adapted from arm limited optimized routine (RISC-V Vector port operating on f32 vectors with LMUL=2; vl is forwarded as the vector-length argument of every intrinsic)
1233
+ // the maximum error is 1.45358 plus 0.5 ulps
1234
+ // numbers above 88.38 will flush to infinity
1235
+ // numbers beneath -103.97 will flush to zero
1236
+ inline static vfloat32m2_t wsp_ggml_v_expf_m2(vfloat32m2_t x, int vl) {
1237
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl); // broadcast 1.5 * 2^23: adding and then subtracting it rounds a float to an integer
1238
+ #ifdef __riscv_xtheadvector
1239
+ // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
1240
+ vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl); // z = r + 0.0f, i.e. a copy of r produced by an arithmetic op
1241
+ z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl); // z = r + x * log2(e), same value as the #else branch
1242
+ #else
1243
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl); // z = r + x * log2(e) (0x1.715476p+0f ~= 1.442695)
1244
+ #endif
1245
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl); // n = z - r, i.e. x * log2(e) rounded to an integer value
1246
+ const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl), // b = x - n * ln2_hi ...
1247
+ 0x1.7f7d1cp-20f, n, vl); // ... - n * ln2_lo: reduced argument, ln(2) split in two parts for precision
1248
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl); // bit pattern of z shifted left by 23: moves the integer held in z's low mantissa bits into the exponent field
1249
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f bit pattern added to e, reinterpreted as float: k = 2^n while n is in the normal exponent range
1250
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl); // lanes with |n| > 126, where k above would leave the normal exponent range
1251
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl); // u = b^2
1252
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2( // j = c1*b + ((c2 + c3*b) + (c4 + c5*b)*u)*u: polynomial in b (appears to approximate exp(b) - 1)
1253
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl), // c1 * b
1254
+ __riscv_vfmacc_vv_f32m2(
1255
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl), // c2 + c3 * b
1256
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl), // c4 + c5 * b
1257
+ u, vl), u, vl);
1258
+ if (!__riscv_vcpop_m_b16(c, vl)) // fast path: no lane among the first vl has |n| > 126
1259
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl); // k + j * k
1260
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl); // lanes with n <= 0
1261
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl); // d = 0x82000000 where n <= 0, else 0
1262
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl)); // bits 0x7f000000 (2^127) where n > 0; the 32-bit add wraps to 0x01000000 (2^-125) where n <= 0
1263
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl)); // bits e - d: the remaining scale factor, so that s2 * s1 stands in for k
1264
+ const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2( // per lane: second operand where c is set, first operand otherwise
1265
+ __riscv_vfmacc_vv_f32m2(k, k, j, vl), // normal lanes: k + k * j
1266
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl), // |n| > 126 lanes: (s2 + s2 * j) * s1, scaling applied in two steps
1267
+ c, vl);
1268
+ return __riscv_vmerge_vvm_f32m2(
1269
+ r1, __riscv_vfmul_vv_f32m2(s1, s1, vl), // s1 * s1 overflows to +inf (n > 0) or underflows to 0 (n <= 0)
1270
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl), // lanes with |n| > 192 take s1 * s1, all others keep r1
1271
+ vl);
1272
+ }
1273
+
1274
+ // computes silu x/(1+exp(-x)) in single precision vector (RISC-V Vector variant on f32 LMUL=2 vectors; vl is forwarded as the vector-length argument of every intrinsic)
1275
+ inline static vfloat32m2_t wsp_ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1276
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl); // -x
1277
+ const vfloat32m2_t exp_neg_x = wsp_ggml_v_expf_m2(neg_x, vl); // exp(-x) via the vector expf helper
1278
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl); // 1 + exp(-x)
1279
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl); // x / (1 + exp(-x))
1280
+ }
1281
+
1282
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
932
1283
 
933
1284
  inline static void wsp_ggml_vec_silu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
934
1285
  for (int i = 0; i < n; ++i) {
package/cpp/ggml-cpu.h CHANGED
@@ -101,7 +101,6 @@ extern "C" {
101
101
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
102
102
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
103
103
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
104
- WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
105
104
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
106
105
  WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);
107
106
 
@@ -135,6 +134,7 @@ extern "C" {
135
134
  WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
136
135
 
137
136
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
137
+ WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
138
138
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
139
139
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
140
140
  WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);