whisper.rn 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/android/src/main/jni.cpp +12 -3
- package/cpp/ggml-alloc.c +292 -130
- package/cpp/ggml-backend-impl.h +4 -4
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +19 -1
- package/cpp/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +14 -7
- package/cpp/ggml-cpu/ggml-cpu.c +65 -44
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +542 -775
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/unary-ops.cpp +135 -0
- package/cpp/ggml-cpu/unary-ops.h +5 -0
- package/cpp/ggml-cpu/vec.cpp +227 -20
- package/cpp/ggml-cpu/vec.h +407 -56
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +94 -12
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1565 -0
- package/cpp/ggml-metal/ggml-metal-device.h +244 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1325 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +802 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3583 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +88 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +341 -15
- package/cpp/ggml.h +150 -5
- package/cpp/jsi/RNWhisperJSI.cpp +9 -2
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/rn-whisper.h +1 -0
- package/cpp/whisper.cpp +89 -72
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperContext.mm +3 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -4
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +19 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +94 -12
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +150 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +2 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +2 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-cpu/vec.cpp
CHANGED
|
@@ -84,6 +84,22 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
|
|
|
84
84
|
}
|
|
85
85
|
// reduce sum1..sum8 into sumf
|
|
86
86
|
WSP_GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
|
87
|
+
#elif defined(__riscv_v_intrinsic)
|
|
88
|
+
int vl = __riscv_vsetvlmax_e32m8();
|
|
89
|
+
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
|
90
|
+
vfloat32m8_t vsum;
|
|
91
|
+
vfloat32m8_t ax;
|
|
92
|
+
vfloat32m8_t ay;
|
|
93
|
+
vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
|
|
94
|
+
for (int i = 0; i < n; i += vl) {
|
|
95
|
+
vl = __riscv_vsetvl_e32m8(n - i);
|
|
96
|
+
ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
|
|
97
|
+
ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
|
|
98
|
+
vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
|
|
99
|
+
}
|
|
100
|
+
vl = __riscv_vsetvlmax_e32m8();
|
|
101
|
+
vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
|
|
102
|
+
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
|
87
103
|
#else
|
|
88
104
|
const int np = (n & ~(WSP_GGML_F32_STEP - 1));
|
|
89
105
|
|
|
@@ -197,38 +213,125 @@ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggm
|
|
|
197
213
|
|
|
198
214
|
wsp_ggml_float sumf = 0.0;
|
|
199
215
|
|
|
200
|
-
#if defined(WSP_GGML_SIMD)
|
|
201
|
-
const int np = (n & ~(WSP_GGML_F16_STEP - 1));
|
|
202
216
|
|
|
203
|
-
|
|
217
|
+
#if defined(WSP_GGML_SIMD)
|
|
218
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
219
|
+
const int sve_register_length = svcntb() * 8; // SVE vector length in bits (svcntb() gives it in bytes)
|
|
220
|
+
const int wsp_ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register (16 bits each)
|
|
221
|
+
const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
|
|
222
|
+
|
|
223
|
+
const int np= (n & ~(wsp_ggml_f16_step - 1));
|
|
224
|
+
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
|
225
|
+
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
|
226
|
+
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
|
227
|
+
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
|
228
|
+
|
|
229
|
+
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
|
230
|
+
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
|
231
|
+
for (int i = 0; i < np; i += wsp_ggml_f16_step) {
|
|
232
|
+
ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
|
|
233
|
+
ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
|
|
234
|
+
sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
|
235
|
+
|
|
236
|
+
ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
|
|
237
|
+
ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
|
|
238
|
+
sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
|
239
|
+
|
|
240
|
+
ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
|
|
241
|
+
ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
|
|
242
|
+
sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
|
243
|
+
|
|
244
|
+
ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
|
|
245
|
+
ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
|
|
246
|
+
sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
|
247
|
+
|
|
248
|
+
ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
|
|
249
|
+
ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
|
|
250
|
+
sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
|
251
|
+
|
|
252
|
+
ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
|
|
253
|
+
ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
|
|
254
|
+
sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
|
255
|
+
|
|
256
|
+
ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
|
|
257
|
+
ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
|
|
258
|
+
sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
|
259
|
+
|
|
260
|
+
ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
|
|
261
|
+
ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
|
|
262
|
+
sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
|
263
|
+
}
|
|
204
264
|
|
|
205
|
-
|
|
206
|
-
|
|
265
|
+
const int np2 = (n & ~(wsp_ggml_f16_epr - 1)); // round down to a multiple of wsp_ggml_f16_epr (one full register)
|
|
266
|
+
for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
|
|
267
|
+
svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
|
|
268
|
+
svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
|
|
269
|
+
sum1 = WSP_GGML_F16x_VEC_FMA(sum1, rx, ry);
|
|
270
|
+
}
|
|
207
271
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
272
|
+
if (np2 < n) {
|
|
273
|
+
svbool_t pg = svwhilelt_b16(np2, n);
|
|
274
|
+
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
|
275
|
+
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
|
212
276
|
|
|
213
|
-
|
|
277
|
+
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
|
214
278
|
}
|
|
215
|
-
|
|
279
|
+
WSP_GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
|
280
|
+
#elif defined(__riscv_v_intrinsic)
|
|
281
|
+
#if defined(__riscv_zvfh)
|
|
282
|
+
int vl = __riscv_vsetvlmax_e32m2();
|
|
283
|
+
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
|
284
|
+
vfloat32m2_t vsum;
|
|
285
|
+
vfloat16m1_t ax;
|
|
286
|
+
vfloat16m1_t ay;
|
|
287
|
+
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
|
|
288
|
+
for (int i = 0; i < n; i += vl) {
|
|
289
|
+
vl = __riscv_vsetvl_e16m1(n - i);
|
|
290
|
+
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
|
|
291
|
+
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
|
|
292
|
+
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
|
|
293
|
+
}
|
|
294
|
+
vl = __riscv_vsetvlmax_e32m1();
|
|
295
|
+
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
|
|
296
|
+
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
|
|
297
|
+
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
|
|
298
|
+
#else
|
|
299
|
+
for (int i = 0; i < n; ++i) {
|
|
300
|
+
sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
|
|
301
|
+
}
|
|
302
|
+
#endif // __riscv_zvfh
|
|
303
|
+
#else
|
|
304
|
+
const int np = (n & ~(WSP_GGML_F16_STEP - 1));
|
|
216
305
|
|
|
217
|
-
|
|
218
|
-
WSP_GGML_F16_VEC_REDUCE(sumf, sum);
|
|
306
|
+
WSP_GGML_F16_VEC sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO };
|
|
219
307
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
308
|
+
WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
|
|
309
|
+
WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
|
|
310
|
+
|
|
311
|
+
for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
|
|
312
|
+
for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
|
|
313
|
+
ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
|
|
314
|
+
ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
|
|
315
|
+
|
|
316
|
+
sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
// reduce the sum[] accumulators into sumf
|
|
321
|
+
WSP_GGML_F16_VEC_REDUCE(sumf, sum);
|
|
224
322
|
|
|
225
|
-
|
|
226
|
-
|
|
323
|
+
// leftovers
|
|
324
|
+
for (int i = np; i < n; ++i) {
|
|
325
|
+
sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
|
|
326
|
+
}
|
|
327
|
+
// if you hit this, you are likely running outside the FP range
|
|
328
|
+
assert(!isnan(sumf) && !isinf(sumf));
|
|
329
|
+
#endif
|
|
227
330
|
#else
|
|
228
331
|
for (int i = 0; i < n; ++i) {
|
|
229
332
|
sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
|
|
230
333
|
}
|
|
231
|
-
#endif
|
|
334
|
+
#endif // WSP_GGML_SIMD
|
|
232
335
|
|
|
233
336
|
*s = sumf;
|
|
234
337
|
}
|
|
@@ -247,6 +350,12 @@ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
|
|
247
350
|
for (; i + 3 < n; i += 4) {
|
|
248
351
|
_mm_storeu_ps(y + i, wsp_ggml_v_silu(_mm_loadu_ps(x + i)));
|
|
249
352
|
}
|
|
353
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
354
|
+
const int vlen = svcntw();
|
|
355
|
+
for (; i < n; i += vlen) {
|
|
356
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
357
|
+
svst1_f32(pg, y + i, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)));
|
|
358
|
+
}
|
|
250
359
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
251
360
|
for (; i + 3 < n; i += 4) {
|
|
252
361
|
vst1q_f32(y + i, wsp_ggml_v_silu(vld1q_f32(x + i)));
|
|
@@ -271,16 +380,96 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
|
|
|
271
380
|
for (; i + 3 < n; i += 4) {
|
|
272
381
|
_mm_storeu_ps(y + i, _mm_mul_ps(wsp_ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
|
|
273
382
|
}
|
|
383
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
384
|
+
const int vlen = svcntw();
|
|
385
|
+
for (; i < n; i += vlen) {
|
|
386
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
387
|
+
svst1_f32(pg, y + i, svmul_f32_x(pg, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
|
|
388
|
+
}
|
|
274
389
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
275
390
|
for (; i + 3 < n; i += 4) {
|
|
276
391
|
vst1q_f32(y + i, vmulq_f32(wsp_ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
|
|
277
392
|
}
|
|
393
|
+
#elif defined(__riscv_v_intrinsic)
|
|
394
|
+
for (int vl; i < n; i += vl) {
|
|
395
|
+
vl = __riscv_vsetvl_e32m2(n - i);
|
|
396
|
+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
|
397
|
+
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
|
|
398
|
+
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(wsp_ggml_v_silu_m2(vx, vl), vg, vl);
|
|
399
|
+
__riscv_vse32_v_f32m2(&y[i], vy, vl);
|
|
400
|
+
}
|
|
278
401
|
#endif
|
|
279
402
|
for (; i < n; ++i) {
|
|
280
403
|
y[i] = wsp_ggml_silu_f32(x[i]) * g[i];
|
|
281
404
|
}
|
|
282
405
|
}
|
|
283
406
|
|
|
407
|
+
wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) {
|
|
408
|
+
int i = 0;
|
|
409
|
+
wsp_ggml_float sum = 0;
|
|
410
|
+
// TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
|
|
411
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
|
|
412
|
+
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
|
413
|
+
for (; i + 15 < n; i += 16) {
|
|
414
|
+
__m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
|
|
415
|
+
_mm512_set1_ps(mean));
|
|
416
|
+
_mm512_storeu_ps(y + i, val);
|
|
417
|
+
sum += (wsp_ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
|
|
418
|
+
}
|
|
419
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
|
420
|
+
for (; i + 7 < n; i += 8) {
|
|
421
|
+
__m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
|
|
422
|
+
_mm256_set1_ps(mean));
|
|
423
|
+
_mm256_storeu_ps(y + i, val);
|
|
424
|
+
val = _mm256_mul_ps(val,val);
|
|
425
|
+
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
|
|
426
|
+
_mm256_castps256_ps128(val));
|
|
427
|
+
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
|
|
428
|
+
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
|
|
429
|
+
sum += (wsp_ggml_float)_mm_cvtss_f32(val2);
|
|
430
|
+
}
|
|
431
|
+
#elif defined(__SSE2__)
|
|
432
|
+
for (; i + 3 < n; i += 4) {
|
|
433
|
+
__m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
|
|
434
|
+
_mm_set1_ps(mean));
|
|
435
|
+
_mm_storeu_ps(y + i, val);
|
|
436
|
+
val = _mm_mul_ps(val, val);
|
|
437
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
|
438
|
+
val = _mm_add_ps(val, _mm_movehl_ps(val, val));
|
|
439
|
+
val = _mm_add_ss(val, _mm_movehdup_ps(val));
|
|
440
|
+
#else
|
|
441
|
+
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
|
|
442
|
+
val = _mm_add_ps(val, tmp);
|
|
443
|
+
tmp = _mm_movehl_ps(tmp, val);
|
|
444
|
+
val = _mm_add_ss(val, tmp);
|
|
445
|
+
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
|
446
|
+
sum += (wsp_ggml_float)_mm_cvtss_f32(val);
|
|
447
|
+
}
|
|
448
|
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
449
|
+
for (; i + 3 < n; i += 4) {
|
|
450
|
+
float32x4_t val = vsubq_f32(vld1q_f32(x + i),
|
|
451
|
+
vdupq_n_f32(mean));
|
|
452
|
+
vst1q_f32(y + i, val);
|
|
453
|
+
val = vmulq_f32(val, val);
|
|
454
|
+
sum += (wsp_ggml_float)vaddvq_f32(val);
|
|
455
|
+
}
|
|
456
|
+
#elif defined(__VXE__) || defined(__VXE2__)
|
|
457
|
+
for (; i + 3 < n; i += 4) {
|
|
458
|
+
float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
|
|
459
|
+
vec_xst(val, 0, y + i);
|
|
460
|
+
val = vec_mul(val, val);
|
|
461
|
+
sum += (wsp_ggml_float)vec_hsum_f32x4(val);
|
|
462
|
+
}
|
|
463
|
+
#endif
|
|
464
|
+
for (; i < n; ++i) {
|
|
465
|
+
float val = x[i] - mean;
|
|
466
|
+
y[i] = val;
|
|
467
|
+
val *= val;
|
|
468
|
+
sum += (wsp_ggml_float)val;
|
|
469
|
+
}
|
|
470
|
+
return sum/n;
|
|
471
|
+
}
|
|
472
|
+
|
|
284
473
|
wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
|
|
285
474
|
int i = 0;
|
|
286
475
|
wsp_ggml_float sum = 0;
|
|
@@ -318,6 +507,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
|
|
|
318
507
|
#endif
|
|
319
508
|
sum += (wsp_ggml_float)_mm_cvtss_f32(val);
|
|
320
509
|
}
|
|
510
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
511
|
+
const int vlen = svcntw();
|
|
512
|
+
for (; i < n; i += vlen) {
|
|
513
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
514
|
+
svfloat32_t val = wsp_ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
|
|
515
|
+
svdup_n_f32_x(pg, max)));
|
|
516
|
+
svst1_f32(pg, y + i, val);
|
|
517
|
+
sum += (wsp_ggml_float)svaddv_f32(pg, val);
|
|
518
|
+
}
|
|
321
519
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
322
520
|
for (; i + 3 < n; i += 4) {
|
|
323
521
|
float32x4_t val = wsp_ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
|
@@ -325,6 +523,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
|
|
|
325
523
|
vst1q_f32(y + i, val);
|
|
326
524
|
sum += (wsp_ggml_float)vaddvq_f32(val);
|
|
327
525
|
}
|
|
526
|
+
#elif defined(__riscv_v_intrinsic)
|
|
527
|
+
vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
|
|
528
|
+
for (int avl; i < n; i += avl) {
|
|
529
|
+
avl = __riscv_vsetvl_e32m2(n - i);
|
|
530
|
+
vfloat32m2_t val = wsp_ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
|
|
531
|
+
__riscv_vse32_v_f32m2(&y[i], val, avl);
|
|
532
|
+
vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
|
|
533
|
+
}
|
|
534
|
+
return (wsp_ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
|
|
328
535
|
#endif
|
|
329
536
|
for (; i < n; ++i) {
|
|
330
537
|
float val = expf(x[i] - max);
|