whisper.rn 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
@@ -69,7 +69,9 @@ void wsp_ggml_compute_forward_clamp(const struct wsp_ggml_compute_params * param
  void wsp_ggml_compute_forward_conv_transpose_1d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_im2col(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_im2col_back_f32(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_im2col_3d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_conv_2d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+ void wsp_ggml_compute_forward_conv_3d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_conv_transpose_2d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_conv_2d_dw(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
  void wsp_ggml_compute_forward_pool_1d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
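Note: this hunk is from package/cpp/ggml-cpu/ops.h (+2 -0 in the file list): 0.5.1 syncs upstream ggml's new 3D im2col and 3D convolution CPU ops into the vendored wsp_-prefixed sources. For readers unfamiliar with the im2col trick these ops build on, here is a minimal scalar sketch (2D case, single channel, stride 1, no padding; names are illustrative, not the package's API). Each output pixel's receptive field is flattened into one row, so the convolution becomes a plain matrix multiply against the flattened kernel.

    #include <stddef.h>

    void im2col_2d_f32(const float *src, int h, int w,
                       int kh, int kw, float *dst) {
        const int oh = h - kh + 1;   /* output height */
        const int ow = w - kw + 1;   /* output width  */
        for (int oy = 0; oy < oh; oy++) {
            for (int ox = 0; ox < ow; ox++) {
                /* one dst row of kh*kw samples per output pixel */
                float *row = dst + (size_t)(oy * ow + ox) * kh * kw;
                for (int ky = 0; ky < kh; ky++)
                    for (int kx = 0; kx < kw; kx++)
                        row[ky * kw + kx] = src[(oy + ky) * w + (ox + kx)];
            }
        }
    }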
@@ -18,6 +18,10 @@
  #include <immintrin.h>
  #endif

+ #if defined(__riscv_v_intrinsic)
+ #include <riscv_vector.h>
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -94,24 +98,15 @@ extern "C" {
  }
  #elif defined(__riscv) && defined(__riscv_zfhmin)
      static inline float riscv_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
-         float f;
-         __asm__(
-             "fmv.h.x %[f], %[h]\n\t"
-             "fcvt.s.h %[f], %[f]"
-             : [f] "=&f" (f)
-             : [h] "r" (h)
-         );
-         return f;
+         _Float16 hf;
+         memcpy(&hf, &h, sizeof(wsp_ggml_fp16_t));
+         return hf;
      }

      static inline wsp_ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
          wsp_ggml_fp16_t res;
-         __asm__(
-             "fcvt.h.s %[f], %[f]\n\t"
-             "fmv.x.h %[h], %[f]"
-             : [h] "=&r" (res)
-             : [f] "f" (f)
-         );
+         _Float16 hf = (_Float16)f;
+         memcpy(&res, &hf, sizeof(wsp_ggml_fp16_t));
          return res;
      }

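Note: the RISC-V Zfhmin conversion helpers drop hand-written inline assembly in favor of a memcpy bit-cast through _Float16, letting the compiler emit the same fmv/fcvt instructions while staying within defined behavior. A standalone sketch of the pattern (assumes a compiler with _Float16 support, e.g. GCC or Clang targeting rv64 with Zfhmin):

    #include <stdint.h>
    #include <string.h>

    static inline float fp16_bits_to_f32(uint16_t bits) {
        _Float16 h;
        memcpy(&h, &bits, sizeof bits);  /* reinterpret the raw 16-bit payload */
        return (float)h;                 /* widening conversion, exact */
    }

    static inline uint16_t f32_to_fp16_bits(float f) {
        _Float16 h = (_Float16)f;        /* narrowing conversion, may round */
        uint16_t bits;
        memcpy(&bits, &h, sizeof bits);
        return bits;
    }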
@@ -119,26 +114,6 @@ extern "C" {
  #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
  #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
  #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
- #elif defined(__NNPA__)
-     #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-     #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-     #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-     #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-     static inline float nnpa_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
-         uint16x8_t v_h = vec_splats(h);
-         uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-         return vec_extend_to_fp32_hi(v_hd, 0)[0];
-     }
-
-     static inline wsp_ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-         float32x4_t v_f = vec_splats(f);
-         float32x4_t v_zero = vec_splats(0.0f);
-         uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-         uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-         return vec_extract(v_h, 0);
-     }
  #endif

  // precomputed f32 table for f16 (256 KB)
@@ -220,6 +195,47 @@ inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) {
  #define WSP_GGML_F32_VEC_MUL WSP_GGML_F32xt_MUL
  #define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32xt_REDUCE

+ // F16 SVE
+ #define DEFAULT_PG32 svptrue_b32()
+ #define DEFAULT_PG16 svptrue_b16()
+
+ #define WSP_GGML_F32Cxt svfloat16_t
+ #define WSP_GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
+ #define WSP_GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+ #define WSP_GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+ #define WSP_GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+ #define WSP_GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+ #define WSP_GGML_F32Cxt_FMA(...) WSP_GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define WSP_GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+ #define WSP_GGML_F32Cxt_ADD(...) WSP_GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define WSP_GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+ #define WSP_GGML_F32Cxt_MUL(...) WSP_GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+ #define WSP_GGML_F32Cxt_REDUCE WSP_GGML_F16xt_REDUCE_MIXED
+
+ #define WSP_GGML_F16x_VEC WSP_GGML_F32Cxt
+ #define WSP_GGML_F16x_VEC_ZERO WSP_GGML_F32Cxt_ZERO
+ #define WSP_GGML_F16x_VEC_SET1 WSP_GGML_F32Cxt_SET1
+ #define WSP_GGML_F16x_VEC_LOAD(p, i) WSP_GGML_F32Cxt_LOAD(p)
+ #define WSP_GGML_F16x_VEC_STORE(p, r, i) WSP_GGML_F32Cxt_STORE((__fp16 *)(p), r)
+ #define WSP_GGML_F16x_VEC_FMA WSP_GGML_F32Cxt_FMA
+ #define WSP_GGML_F16x_VEC_ADD WSP_GGML_F32Cxt_ADD
+ #define WSP_GGML_F16x_VEC_MUL WSP_GGML_F32Cxt_MUL
+ #define WSP_GGML_F16x_VEC_REDUCE WSP_GGML_F32Cxt_REDUCE
+
+ #define WSP_GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+ #define WSP_GGML_F16xt_REDUCE_ONE(...) WSP_GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+ #define WSP_GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+ {                                                                           \
+     sum1 = svadd_f16_x(pg16, sum1, sum2);                                   \
+     sum3 = svadd_f16_x(pg16, sum3, sum4);                                   \
+     sum1 = svadd_f16_x(pg16, sum1, sum3);                                   \
+     __fp16 sum_f16 = svaddv_f16(pg16, sum1);                                \
+     (res) = (wsp_ggml_float) sum_f16;                                       \
+ }
+ #define WSP_GGML_F16xt_REDUCE_MIXED(...) WSP_GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
  // F16 NEON

  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
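Note: these new macros give the SVE build a native-fp16 accumulation path with a single mixed-precision reduction at the end. Mind the argument order: WSP_GGML_F32Cxt_FMA(a, b, c) computes a + b*c, because svmad(op1, op2, op3) evaluates op1*op2 + op3. A minimal self-contained dot product in the same spirit (assumes <arm_sve.h> and an SVE toolchain; illustrative, not the package's code):

    #include <arm_sve.h>

    static float dot_f16_sve(const __fp16 *x, const __fp16 *y, int n) {
        svfloat16_t acc = svdup_n_f16(0.0f);
        for (int i = 0; i < n; i += (int)svcnth()) {
            svbool_t pg = svwhilelt_b16_s32(i, n);     /* partial predicate at tail */
            svfloat16_t vx = svld1_f16(pg, x + i);     /* inactive lanes load as 0  */
            svfloat16_t vy = svld1_f16(pg, y + i);
            /* merging form: inactive lanes keep acc's previous partial sums */
            acc = svmla_f16_m(pg, acc, vx, vy);        /* acc += vx * vy */
        }
        return (float)svaddv_f16(svptrue_b16(), acc);  /* horizontal add */
    }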
@@ -982,9 +998,9 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
  #define WSP_GGML_F32_EPR 4

  #define WSP_GGML_F32x4 __m128
- #define WSP_GGML_F32x4_ZERO __lsx_vldi(0)
- #define WSP_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
- #define WSP_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+ #define WSP_GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
+ #define WSP_GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+ #define WSP_GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
  #define WSP_GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
  #define WSP_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
  #define WSP_GGML_F32x4_ADD __lsx_vfadd_s
@@ -1006,7 +1022,7 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
      __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
      tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
      tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-     const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+     const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
      tmp = __lsx_vsrli_d((__m128i) t0, 32); \
      tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
      tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@@ -1036,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const wsp_ggml_fp16_t * x) {
      tmp[2] = WSP_GGML_CPU_FP16_TO_FP32(x[2]);
      tmp[3] = WSP_GGML_CPU_FP16_TO_FP32(x[3]);

-     return __lsx_vld(tmp, 0);
+     return (__m128)__lsx_vld(tmp, 0);
  }

  static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
@@ -1051,9 +1067,9 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
  }

  #define WSP_GGML_F32Cx4 __m128
- #define WSP_GGML_F32Cx4_ZERO __lsx_vldi(0)
- #define WSP_GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
- #define WSP_GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
+ #define WSP_GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
+ #define WSP_GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+ #define WSP_GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
  #define WSP_GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
  #define WSP_GGML_F32Cx4_FMA WSP_GGML_F32x4_FMA
  #define WSP_GGML_F32Cx4_ADD __lsx_vfadd_s
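Note: the LoongArch changes in this and the surrounding hunks only add explicit (__m128) casts. LSX intrinsics such as __lsx_vldi and __lsx_vld return the integer vector type __m128i, and newer compilers typically reject the implicit conversion to __m128 unless lax vector conversions are enabled, so the bits are relabeled explicitly. Sketch of the idiom (assumes <lsxintrin.h>):

    #include <lsxintrin.h>

    static inline __m128 zero_f32x4(void) {
        /* __lsx_vldi(0) yields an all-zero __m128i; the cast just
         * reinterprets the same 128 bits as four floats */
        return (__m128)__lsx_vldi(0);
    }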
@@ -1120,11 +1136,6 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
  #define WSP_GGML_F16_EPR WSP_GGML_F32_EPR

  static inline float32x4_t __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
- #if defined(__NNPA__)
-     uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)x);
-     uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-     return vec_extend_to_fp32_hi(v_xd, 0);
- #else
      float tmp[4];

      for (int i = 0; i < 4; i++) {
@@ -1134,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
      // note: keep type-cast here to prevent compiler bugs
      // see: https://github.com/ggml-org/llama.cpp/issues/12846
      return vec_xl(0, (const float *)(tmp));
- #endif
  }

  static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
- #if defined(__NNPA__)
-     float32x4_t v_zero = vec_splats(0.0f);
-     uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-     uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-     x[0] = vec_extract(v_x, 0);
-     x[1] = vec_extract(v_x, 1);
-     x[2] = vec_extract(v_x, 2);
-     x[3] = vec_extract(v_x, 3);
- #else
      float arr[4];

      // note: keep type-cast here to prevent compiler bugs
@@ -1157,7 +1157,6 @@ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
      for (int i = 0; i < 4; i++) {
          x[i] = WSP_GGML_CPU_FP32_TO_FP16(arr[i]);
      }
- #endif
  }

  #define WSP_GGML_F16_VEC WSP_GGML_F32x4
@@ -1170,6 +1169,36 @@ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
  #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32x4_MUL
  #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32x4_REDUCE

+ #elif defined(__riscv_v_intrinsic)
+
+ // compatible with vlen >= 128
+
+ #define WSP_GGML_SIMD
+
+ // F32
+
+ #define WSP_GGML_F32_STEP 16
+ #define WSP_GGML_F32_EPR 4
+
+ #define WSP_GGML_F32x4 vfloat32m1_t
+ #define WSP_GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, WSP_GGML_F32_EPR)
+ #define WSP_GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, WSP_GGML_F32_EPR)
+
+ #define WSP_GGML_F32_VEC WSP_GGML_F32x4
+ #define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO
+ #define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1
+ #define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD
+ #define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE
+ #define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA
+ #define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD
+ #define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL
+ #define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE
+
  #endif

  // WSP_GGML_F32_ARR / WSP_GGML_F16_ARR
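Note: unlike the strip-mined loops added in vec.cpp below, these macros bolt RVV onto ggml's fixed-width SIMD skeleton by passing a constant element count of 4 (WSP_GGML_F32_EPR) as the vl argument of every intrinsic, which is valid on any core with VLEN >= 128. A sketch of how such fixed-vl intrinsics compose (assumes <riscv_vector.h>; illustrative, not the package's code):

    #include <riscv_vector.h>

    void axpy_f32(int n, float a, const float *x, float *y) {
        int i = 0;
        for (; i + 4 <= n; i += 4) {                       /* 4 lanes per trip */
            vfloat32m1_t vx = __riscv_vle32_v_f32m1(x + i, 4);
            vfloat32m1_t vy = __riscv_vle32_v_f32m1(y + i, 4);
            vy = __riscv_vfmacc_vf_f32m1(vy, a, vx, 4);    /* vy += a * vx */
            __riscv_vse32_v_f32m1(y + i, vy, 4);
        }
        for (; i < n; ++i) y[i] += a * x[i];               /* scalar leftovers */
    }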
@@ -84,6 +84,22 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
      }
      // reduce sum1,sum2 to sum1
      WSP_GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+ #elif defined(__riscv_v_intrinsic)
+     int vl = __riscv_vsetvlmax_e32m8();
+     vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+     vfloat32m8_t vsum;
+     vfloat32m8_t ax;
+     vfloat32m8_t ay;
+     vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+     for (int i = 0; i < n; i += vl) {
+         vl = __riscv_vsetvl_e32m8(n - i);
+         ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+         ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+         vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
+     }
+     vl = __riscv_vsetvlmax_e32m8();
+     vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+     sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
  #else
      const int np = (n & ~(WSP_GGML_F32_STEP - 1));

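Note: the new RVV branch of wsp_ggml_vec_dot_f32 uses the classic strip-mining pattern: __riscv_vsetvl_e32m8 clamps each trip count to min(remaining, VLMAX), and the tail-undisturbed (_tu) intrinsics leave untouched lanes of the m8 accumulator intact on the short final trip, so a single unordered reduction at the end sees every partial sum. A standalone equivalent for reference (assumes <riscv_vector.h>):

    #include <riscv_vector.h>

    float dot_f32_rvv(int n, const float *x, const float *y) {
        size_t vl = __riscv_vsetvlmax_e32m8();
        vfloat32m8_t acc = __riscv_vfmv_v_f_f32m8(0.0f, vl);   /* zero all lanes */
        for (int i = 0; i < n; i += (int)vl) {
            vl = __riscv_vsetvl_e32m8((size_t)(n - i));        /* may shrink at tail */
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(x + i, vl);
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(y + i, vl);
            /* _tu: lanes >= vl keep their previous partial sums */
            acc = __riscv_vfmacc_vv_f32m8_tu(acc, ax, ay, vl);
        }
        vfloat32m1_t s = __riscv_vfmv_v_f_f32m1(0.0f, 1);
        s = __riscv_vfredusum_vs_f32m8_f32m1(acc, s, __riscv_vsetvlmax_e32m8());
        return __riscv_vfmv_f_s_f32m1_f32(s);
    }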
@@ -197,38 +213,125 @@ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggm

      wsp_ggml_float sumf = 0.0;

- #if defined(WSP_GGML_SIMD)
-     const int np = (n & ~(WSP_GGML_F16_STEP - 1));

-     WSP_GGML_F16_VEC sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO };
+ #if defined(WSP_GGML_SIMD)
+     #if defined(__ARM_FEATURE_SVE)
+         const int sve_register_length = svcntb() * 8; //get vector length
+         const int wsp_ggml_f16_epr = sve_register_length / 16; // running when 16
+         const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
+
+         const int np = (n & ~(wsp_ggml_f16_step - 1));
+         svfloat16_t sum1 = svdup_n_f16(0.0f);
+         svfloat16_t sum2 = svdup_n_f16(0.0f);
+         svfloat16_t sum3 = svdup_n_f16(0.0f);
+         svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+         svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+         svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+         for (int i = 0; i < np; i += wsp_ggml_f16_step) {
+             ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
+             ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
+             sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+             ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
+             ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
+             sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+             ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
+             ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
+             sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+             ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
+             ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
+             sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+             ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
+             ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
+             sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax5, ay5);
+
+             ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
+             ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
+             sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+             ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
+             ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
+             sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+             ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
+             ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
+             sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+         }

-     WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
-     WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
+         const int np2 = (n & ~(wsp_ggml_f16_epr - 1)); // round down to multiple of 8
+         for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
+             svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
+             svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
+             sum1 = WSP_GGML_F16x_VEC_FMA(sum1, rx, ry);
+         }

-     for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
-         for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
-             ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
-             ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
+         if (np2 < n) {
+             svbool_t pg = svwhilelt_b16(np2, n);
+             svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+             svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));

-             sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+             sum1 = svmad_f16_x(pg, hx, hy, sum1);
          }
-     }
+         WSP_GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+     #elif defined(__riscv_v_intrinsic)
+         #if defined(__riscv_zvfh)
+             int vl = __riscv_vsetvlmax_e32m2();
+             vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+             vfloat32m2_t vsum;
+             vfloat16m1_t ax;
+             vfloat16m1_t ay;
+             vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+             for (int i = 0; i < n; i += vl) {
+                 vl = __riscv_vsetvl_e16m1(n - i);
+                 ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                 ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                 vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
+             }
+             vl = __riscv_vsetvlmax_e32m1();
+             vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+             vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+             sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+         #else
+             for (int i = 0; i < n; ++i) {
+                 sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
+             }
+         #endif // __riscv_zvfh
+     #else
+         const int np = (n & ~(WSP_GGML_F16_STEP - 1));

-     // reduce sum0..sum3 to sum0
-     WSP_GGML_F16_VEC_REDUCE(sumf, sum);
+         WSP_GGML_F16_VEC sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO };

-     // leftovers
-     for (int i = np; i < n; ++i) {
-         sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
-     }
+         WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
+         WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
+
+         for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
+             for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
+                 ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
+                 ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);

-     // if you hit this, you are likely running outside the FP range
-     assert(!isnan(sumf) && !isinf(sumf));
+                 sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+             }
+         }
+
+         // reduce sum0..sum3 to sum0
+         WSP_GGML_F16_VEC_REDUCE(sumf, sum);
+
+         // leftovers
+         for (int i = np; i < n; ++i) {
+             sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
+         }
+         // if you hit this, you are likely running outside the FP range
+         assert(!isnan(sumf) && !isinf(sumf));
+     #endif
  #else
      for (int i = 0; i < n; ++i) {
          sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
      }
- #endif
+ #endif // WSP_GGML_SIMD

      *s = sumf;
  }
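Note: the fp16 dot product now has two vector paths. On SVE it keeps four fp16 accumulators fed from eight loads per iteration and handles the tail with an svwhilelt predicate instead of a scalar loop; on RISC-V with the Zvfh extension it uses widening FMAs, so fp16 inputs accumulate into f32 lanes and long rows cannot overflow fp16 range. A sketch of that widening idiom (assumes <riscv_vector.h> and Zvfh; illustrative only):

    #include <riscv_vector.h>

    float dot_f16_widen(int n, const _Float16 *x, const _Float16 *y) {
        size_t vl = __riscv_vsetvlmax_e16m1();
        vfloat32m2_t acc = __riscv_vfmv_v_f_f32m2(0.0f, __riscv_vsetvlmax_e32m2());
        for (int i = 0; i < n; i += (int)vl) {
            vl = __riscv_vsetvl_e16m1((size_t)(n - i));
            vfloat16m1_t ax = __riscv_vle16_v_f16m1(x + i, vl);
            vfloat16m1_t ay = __riscv_vle16_v_f16m1(y + i, vl);
            /* widening FMA: f32 accumulator += (f32)ax * (f32)ay */
            acc = __riscv_vfwmacc_vv_f32m2_tu(acc, ax, ay, vl);
        }
        vfloat32m1_t s = __riscv_vfmv_v_f_f32m1(0.0f, 1);
        s = __riscv_vfredusum_vs_f32m2_f32m1(acc, s, __riscv_vsetvlmax_e32m2());
        return __riscv_vfmv_f_s_f32m1_f32(s);
    }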
@@ -247,6 +350,12 @@ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) {
      for (; i + 3 < n; i += 4) {
          _mm_storeu_ps(y + i, wsp_ggml_v_silu(_mm_loadu_ps(x + i)));
      }
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+     const int vlen = svcntw();
+     for (; i < n; i += vlen) {
+         const svbool_t pg = svwhilelt_b32_s32(i, n);
+         svst1_f32(pg, y + i, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)));
+     }
  #elif defined(__ARM_NEON) && defined(__aarch64__)
      for (; i + 3 < n; i += 4) {
          vst1q_f32(y + i, wsp_ggml_v_silu(vld1q_f32(x + i)));
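Note: the new SVE branch needs no scalar leftover loop because svwhilelt builds a partial predicate on the final trip. For reference, the scalar function every vector path here approximates:

    #include <math.h>

    /* SiLU (a.k.a. swish): x * sigmoid(x) = x / (1 + exp(-x)) */
    static inline float silu_ref_f32(float x) {
        return x / (1.0f + expf(-x));
    }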
@@ -271,10 +380,24 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
      for (; i + 3 < n; i += 4) {
          _mm_storeu_ps(y + i, _mm_mul_ps(wsp_ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
      }
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+     const int vlen = svcntw();
+     for (; i < n; i += vlen) {
+         const svbool_t pg = svwhilelt_b32_s32(i, n);
+         svst1_f32(pg, y + i, svmul_f32_x(pg, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
+     }
  #elif defined(__ARM_NEON) && defined(__aarch64__)
      for (; i + 3 < n; i += 4) {
          vst1q_f32(y + i, vmulq_f32(wsp_ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
      }
+ #elif defined(__riscv_v_intrinsic)
+     for (int vl; i < n; i += vl) {
+         vl = __riscv_vsetvl_e32m2(n - i);
+         vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+         vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
+         vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(wsp_ggml_v_silu_m2(vx, vl), vg, vl);
+         __riscv_vse32_v_f32m2(&y[i], vy, vl);
+     }
  #endif
      for (; i < n; ++i) {
          y[i] = wsp_ggml_silu_f32(x[i]) * g[i];
@@ -318,6 +441,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
  #endif
          sum += (wsp_ggml_float)_mm_cvtss_f32(val);
      }
+ #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+     const int vlen = svcntw();
+     for (; i < n; i += vlen) {
+         const svbool_t pg = svwhilelt_b32_s32(i, n);
+         svfloat32_t val = wsp_ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
+                                                           svdup_n_f32_x(pg, max)));
+         svst1_f32(pg, y + i, val);
+         sum += (wsp_ggml_float)svaddv_f32(pg, val);
+     }
  #elif defined(__ARM_NEON) && defined(__aarch64__)
      for (; i + 3 < n; i += 4) {
          float32x4_t val = wsp_ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
@@ -325,6 +457,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
          vst1q_f32(y + i, val);
          sum += (wsp_ggml_float)vaddvq_f32(val);
      }
+ #elif defined(__riscv_v_intrinsic)
+     vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+     for (int avl; i < n; i += avl) {
+         avl = __riscv_vsetvl_e32m2(n - i);
+         vfloat32m2_t val = wsp_ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
+         __riscv_vse32_v_f32m2(&y[i], val, avl);
+         vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
+     }
+     return (wsp_ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
  #endif
      for (; i < n; ++i) {
          float val = expf(x[i] - max);
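Note: every vector branch of wsp_ggml_vec_soft_max_f32 subtracts the row maximum before exponentiating; softmax(x)_i = exp(x_i - max) / sum_j exp(x_j - max) is algebraically identical to the naive form but keeps expf in range. The new RVV branch additionally folds the sum into a widening f64 reduction. Scalar sketch of the same exp-and-accumulate step (illustrative, not the package's code):

    #include <math.h>

    static double softmax_exp_sum(int n, float *y, const float *x, float max) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            y[i] = expf(x[i] - max);   /* x[i] - max <= 0, so expf cannot overflow */
            sum += (double)y[i];
        }
        return sum;                    /* caller normalizes y[] by this sum */
    }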