whisper.rn 0.4.2 → 0.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +1 -3
  2. package/android/build.gradle +70 -11
  3. package/android/src/main/CMakeLists.txt +28 -1
  4. package/android/src/main/java/com/rnwhisper/JSCallInvokerResolver.java +40 -0
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +80 -27
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +21 -9
  7. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +1 -1
  8. package/android/src/main/jni.cpp +79 -2
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  16. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +5 -0
  17. package/cpp/ggml-backend.cpp +36 -18
  18. package/cpp/ggml-backend.h +1 -1
  19. package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
  20. package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
  21. package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
  22. package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
  23. package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
  24. package/cpp/ggml-cpu/common.h +3 -2
  25. package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
  26. package/cpp/ggml-cpu/ggml-cpu.c +95 -17
  27. package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
  28. package/cpp/ggml-cpu/ops.cpp +775 -74
  29. package/cpp/ggml-cpu/ops.h +7 -0
  30. package/cpp/ggml-cpu/quants.c +25 -24
  31. package/cpp/ggml-cpu/repack.cpp +15 -14
  32. package/cpp/ggml-cpu/simd-mappings.h +211 -33
  33. package/cpp/ggml-cpu/vec.cpp +26 -2
  34. package/cpp/ggml-cpu/vec.h +99 -45
  35. package/cpp/ggml-cpu.h +2 -0
  36. package/cpp/ggml-impl.h +125 -183
  37. package/cpp/ggml-metal-impl.h +27 -0
  38. package/cpp/ggml-metal.m +298 -41
  39. package/cpp/ggml-quants.c +6 -6
  40. package/cpp/ggml-whisper-sim.metallib +0 -0
  41. package/cpp/ggml-whisper.metallib +0 -0
  42. package/cpp/ggml.c +269 -40
  43. package/cpp/ggml.h +122 -2
  44. package/cpp/gguf.cpp +5 -1
  45. package/cpp/jsi/RNWhisperJSI.cpp +681 -0
  46. package/cpp/jsi/RNWhisperJSI.h +44 -0
  47. package/cpp/jsi/ThreadPool.h +100 -0
  48. package/cpp/whisper.cpp +4 -0
  49. package/cpp/whisper.h +2 -0
  50. package/ios/RNWhisper.h +3 -0
  51. package/ios/RNWhisper.mm +66 -31
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  57. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  58. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  59. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  68. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  76. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  77. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  79. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  84. package/jest/mock.js +1 -0
  85. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  86. package/lib/commonjs/index.js +83 -2
  87. package/lib/commonjs/index.js.map +1 -1
  88. package/lib/module/NativeRNWhisper.js.map +1 -1
  89. package/lib/module/index.js +83 -2
  90. package/lib/module/index.js.map +1 -1
  91. package/lib/typescript/NativeRNWhisper.d.ts +4 -0
  92. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  93. package/lib/typescript/index.d.ts +18 -6
  94. package/lib/typescript/index.d.ts.map +1 -1
  95. package/package.json +2 -3
  96. package/src/NativeRNWhisper.ts +2 -0
  97. package/src/index.ts +162 -33
  98. package/whisper-rn.podspec +6 -3
@@ -2,10 +2,167 @@
2
2
 
3
3
  #include "ggml-cpu-impl.h"
4
4
 
5
+ #ifdef __ARM_FEATURE_SVE
6
+ #include <arm_sve.h>
7
+ #endif // __ARM_FEATURE_SVE
8
+
9
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
10
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
11
+ //
12
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
13
+ //
14
+ #include <arm_neon.h>
15
+ #endif
16
+
17
+ #if defined(__F16C__)
18
+ #include <immintrin.h>
19
+ #endif
20
+
21
+ #ifdef __cplusplus
22
+ extern "C" {
23
+ #endif
24
+
5
25
  //
6
26
  // simd mappings
7
27
  //
8
28
 
29
+ // FP16 to FP32 conversion
30
+
31
+ // 16-bit float
32
+ // on Arm, we use __fp16
33
+ // on x86, we use uint16_t
34
+ //
35
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
36
+ // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
37
+ //
38
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
39
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
40
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
41
+
42
+ #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
43
+
44
+ static inline float neon_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
45
+ __fp16 tmp;
46
+ memcpy(&tmp, &h, sizeof(wsp_ggml_fp16_t));
47
+ return (float)tmp;
48
+ }
49
+
50
+ static inline wsp_ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
51
+ wsp_ggml_fp16_t res;
52
+ __fp16 tmp = f;
53
+ memcpy(&res, &tmp, sizeof(wsp_ggml_fp16_t));
54
+ return res;
55
+ }
56
+ #elif defined(__F16C__)
57
+ #ifdef _MSC_VER
58
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
59
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
60
+ #else
61
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
62
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
63
+ #endif
64
+ #elif defined(__POWER9_VECTOR__)
65
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
66
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
67
+ /* the inline asm below is about 12% faster than the lookup method */
68
+ #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
69
+ #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
70
+
71
+ static inline float power_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
72
+ float f;
73
+ double d;
74
+ __asm__(
75
+ "mtfprd %0,%2\n"
76
+ "xscvhpdp %0,%0\n"
77
+ "frsp %1,%0\n" :
78
+ /* temp */ "=d"(d),
79
+ /* out */ "=f"(f):
80
+ /* in */ "r"(h));
81
+ return f;
82
+ }
83
+
84
+ static inline wsp_ggml_fp16_t power_compute_fp32_to_fp16(float f) {
85
+ double d;
86
+ wsp_ggml_fp16_t r;
87
+ __asm__( /* xscvdphp can work on double or single precision */
88
+ "xscvdphp %0,%2\n"
89
+ "mffprd %1,%0\n" :
90
+ /* temp */ "=d"(d),
91
+ /* out */ "=r"(r):
92
+ /* in */ "f"(f));
93
+ return r;
94
+ }
95
+ #elif defined(__riscv) && defined(__riscv_zfhmin)
96
+ static inline float riscv_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
97
+ float f;
98
+ __asm__(
99
+ "fmv.h.x %[f], %[h]\n\t"
100
+ "fcvt.s.h %[f], %[f]"
101
+ : [f] "=&f" (f)
102
+ : [h] "r" (h)
103
+ );
104
+ return f;
105
+ }
106
+
107
+ static inline wsp_ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
108
+ wsp_ggml_fp16_t res;
109
+ __asm__(
110
+ "fcvt.h.s %[f], %[f]\n\t"
111
+ "fmv.x.h %[h], %[f]"
112
+ : [h] "=&r" (res)
113
+ : [f] "f" (f)
114
+ );
115
+ return res;
116
+ }
117
+
118
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
119
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
120
+ #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
121
+ #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
122
+ #elif defined(__NNPA__)
123
+ #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
124
+ #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
125
+
126
+ #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
127
+ #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
128
+
129
+ static inline float nnpa_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
130
+ uint16x8_t v_h = vec_splats(h);
131
+ uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
132
+ return vec_extend_to_fp32_hi(v_hd, 0)[0];
133
+ }
134
+
135
+ static inline wsp_ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
136
+ float32x4_t v_f = vec_splats(f);
137
+ float32x4_t v_zero = vec_splats(0.0f);
138
+ uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
139
+ uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
140
+ return vec_extract(v_h, 0);
141
+ }
142
+ #endif
143
+
144
+ // precomputed f32 table for f16 (256 KB)
145
+ // defined in ggml-cpu.c, initialized in wsp_ggml_cpu_init()
146
+ extern float wsp_ggml_table_f32_f16[1 << 16];
147
+
148
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into wsp_ggml_lookup_fp16_to_fp32,
149
+ // so we define WSP_GGML_CPU_FP16_TO_FP32 and WSP_GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
150
+ // This is also true for POWER9.
151
+ #if !defined(WSP_GGML_CPU_FP16_TO_FP32)
152
+ inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) {
153
+ uint16_t s;
154
+ memcpy(&s, &f, sizeof(uint16_t));
155
+ return wsp_ggml_table_f32_f16[s];
156
+ }
157
+
158
+ #define WSP_GGML_CPU_FP16_TO_FP32(x) wsp_ggml_lookup_fp16_to_fp32(x)
159
+ #endif
160
+
161
+ #if !defined(WSP_GGML_CPU_FP32_TO_FP16)
162
+ #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x)
163
+ #endif
164
+
165
+
9
166
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
10
167
  // we then implement the fundamental computation operations below using only these macros
11
168
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const wsp_ggml_fp16_t * x) {
415
572
  float tmp[8];
416
573
 
417
574
  for (int i = 0; i < 8; i++) {
418
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
575
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
419
576
  }
420
577
 
421
578
  return _mm256_loadu_ps(tmp);
@@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(wsp_ggml_fp16_t *x, __m256 y) {
426
583
  _mm256_storeu_ps(arr, y);
427
584
 
428
585
  for (int i = 0; i < 8; i++)
429
- x[i] = WSP_GGML_FP32_TO_FP16(arr[i]);
586
+ x[i] = WSP_GGML_CPU_FP32_TO_FP16(arr[i]);
430
587
  }
431
588
  #define WSP_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
432
589
  #define WSP_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -574,10 +731,10 @@ static inline unsigned char wsp_ggml_endian_byte(int i) {
574
731
  inline static v128_t __wasm_f16x4_load(const wsp_ggml_fp16_t * p) {
575
732
  float tmp[4];
576
733
 
577
- tmp[0] = WSP_GGML_FP16_TO_FP32(p[0]);
578
- tmp[1] = WSP_GGML_FP16_TO_FP32(p[1]);
579
- tmp[2] = WSP_GGML_FP16_TO_FP32(p[2]);
580
- tmp[3] = WSP_GGML_FP16_TO_FP32(p[3]);
734
+ tmp[0] = WSP_GGML_CPU_FP16_TO_FP32(p[0]);
735
+ tmp[1] = WSP_GGML_CPU_FP16_TO_FP32(p[1]);
736
+ tmp[2] = WSP_GGML_CPU_FP16_TO_FP32(p[2]);
737
+ tmp[3] = WSP_GGML_CPU_FP16_TO_FP32(p[3]);
581
738
 
582
739
  return wasm_v128_load(tmp);
583
740
  }
@@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) {
587
744
 
588
745
  wasm_v128_store(tmp, x);
589
746
 
590
- p[0] = WSP_GGML_FP32_TO_FP16(tmp[0]);
591
- p[1] = WSP_GGML_FP32_TO_FP16(tmp[1]);
592
- p[2] = WSP_GGML_FP32_TO_FP16(tmp[2]);
593
- p[3] = WSP_GGML_FP32_TO_FP16(tmp[3]);
747
+ p[0] = WSP_GGML_CPU_FP32_TO_FP16(tmp[0]);
748
+ p[1] = WSP_GGML_CPU_FP32_TO_FP16(tmp[1]);
749
+ p[2] = WSP_GGML_CPU_FP32_TO_FP16(tmp[2]);
750
+ p[3] = WSP_GGML_CPU_FP32_TO_FP16(tmp[3]);
594
751
  }
595
752
 
596
753
  #define WSP_GGML_F16x4 v128_t
@@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) {
690
847
  static inline __m128 __sse_f16x4_load(const wsp_ggml_fp16_t * x) {
691
848
  float tmp[4];
692
849
 
693
- tmp[0] = WSP_GGML_FP16_TO_FP32(x[0]);
694
- tmp[1] = WSP_GGML_FP16_TO_FP32(x[1]);
695
- tmp[2] = WSP_GGML_FP16_TO_FP32(x[2]);
696
- tmp[3] = WSP_GGML_FP16_TO_FP32(x[3]);
850
+ tmp[0] = WSP_GGML_CPU_FP16_TO_FP32(x[0]);
851
+ tmp[1] = WSP_GGML_CPU_FP16_TO_FP32(x[1]);
852
+ tmp[2] = WSP_GGML_CPU_FP16_TO_FP32(x[2]);
853
+ tmp[3] = WSP_GGML_CPU_FP16_TO_FP32(x[3]);
697
854
 
698
855
  return _mm_loadu_ps(tmp);
699
856
  }
@@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
703
860
 
704
861
  _mm_storeu_ps(arr, y);
705
862
 
706
- x[0] = WSP_GGML_FP32_TO_FP16(arr[0]);
707
- x[1] = WSP_GGML_FP32_TO_FP16(arr[1]);
708
- x[2] = WSP_GGML_FP32_TO_FP16(arr[2]);
709
- x[3] = WSP_GGML_FP32_TO_FP16(arr[3]);
863
+ x[0] = WSP_GGML_CPU_FP32_TO_FP16(arr[0]);
864
+ x[1] = WSP_GGML_CPU_FP32_TO_FP16(arr[1]);
865
+ x[2] = WSP_GGML_CPU_FP32_TO_FP16(arr[2]);
866
+ x[3] = WSP_GGML_CPU_FP32_TO_FP16(arr[3]);
710
867
  }
711
868
 
712
869
  #define WSP_GGML_F32Cx4 __m128
@@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
828
985
  #define WSP_GGML_F32x4_ZERO __lsx_vldi(0)
829
986
  #define WSP_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
830
987
  #define WSP_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
831
- #define WSP_GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
988
+ #define WSP_GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
832
989
  #define WSP_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
833
990
  #define WSP_GGML_F32x4_ADD __lsx_vfadd_s
834
991
  #define WSP_GGML_F32x4_MUL __lsx_vfmul_s
@@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
874
1031
  static inline __m128 __lsx_f16x4_load(const wsp_ggml_fp16_t * x) {
875
1032
  float tmp[4];
876
1033
 
877
- tmp[0] = WSP_GGML_FP16_TO_FP32(x[0]);
878
- tmp[1] = WSP_GGML_FP16_TO_FP32(x[1]);
879
- tmp[2] = WSP_GGML_FP16_TO_FP32(x[2]);
880
- tmp[3] = WSP_GGML_FP16_TO_FP32(x[3]);
1034
+ tmp[0] = WSP_GGML_CPU_FP16_TO_FP32(x[0]);
1035
+ tmp[1] = WSP_GGML_CPU_FP16_TO_FP32(x[1]);
1036
+ tmp[2] = WSP_GGML_CPU_FP16_TO_FP32(x[2]);
1037
+ tmp[3] = WSP_GGML_CPU_FP16_TO_FP32(x[3]);
881
1038
 
882
1039
  return __lsx_vld(tmp, 0);
883
1040
  }
@@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
887
1044
 
888
1045
  __lsx_vst(y, arr, 0);
889
1046
 
890
- x[0] = WSP_GGML_FP32_TO_FP16(arr[0]);
891
- x[1] = WSP_GGML_FP32_TO_FP16(arr[1]);
892
- x[2] = WSP_GGML_FP32_TO_FP16(arr[2]);
893
- x[3] = WSP_GGML_FP32_TO_FP16(arr[3]);
1047
+ x[0] = WSP_GGML_CPU_FP32_TO_FP16(arr[0]);
1048
+ x[1] = WSP_GGML_CPU_FP32_TO_FP16(arr[1]);
1049
+ x[2] = WSP_GGML_CPU_FP32_TO_FP16(arr[2]);
1050
+ x[3] = WSP_GGML_CPU_FP32_TO_FP16(arr[3]);
894
1051
  }
895
1052
 
896
1053
  #define WSP_GGML_F32Cx4 __m128
@@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
922
1079
  #define WSP_GGML_F32_STEP 32
923
1080
  #define WSP_GGML_F32_EPR 4
924
1081
 
925
- #define WSP_GGML_F32x4 __vector float
1082
+ #define WSP_GGML_F32x4 float32x4_t
926
1083
  #define WSP_GGML_F32x4_ZERO vec_splats(0.0f)
927
1084
  #define WSP_GGML_F32x4_SET1 vec_splats
928
1085
  #define WSP_GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -962,28 +1119,45 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
962
1119
  #define WSP_GGML_F16_STEP WSP_GGML_F32_STEP
963
1120
  #define WSP_GGML_F16_EPR WSP_GGML_F32_EPR
964
1121
 
965
- static inline __vector float __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
1122
+ static inline float32x4_t __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
1123
+ #if defined(__NNPA__)
1124
+ uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)x);
1125
+ uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
1126
+ return vec_extend_to_fp32_hi(v_xd, 0);
1127
+ #else
966
1128
  float tmp[4];
967
1129
 
968
1130
  for (int i = 0; i < 4; i++) {
969
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
1131
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
970
1132
  }
971
1133
 
972
1134
  // note: keep type-cast here to prevent compiler bugs
973
1135
  // see: https://github.com/ggml-org/llama.cpp/issues/12846
974
1136
  return vec_xl(0, (const float *)(tmp));
1137
+ #endif
975
1138
  }
976
1139
 
977
- static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, __vector float y) {
1140
+ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
1141
+ #if defined(__NNPA__)
1142
+ float32x4_t v_zero = vec_splats(0.0f);
1143
+ uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
1144
+ uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
1145
+
1146
+ x[0] = vec_extract(v_x, 0);
1147
+ x[1] = vec_extract(v_x, 1);
1148
+ x[2] = vec_extract(v_x, 2);
1149
+ x[3] = vec_extract(v_x, 3);
1150
+ #else
978
1151
  float arr[4];
979
1152
 
980
1153
  // note: keep type-cast here to prevent compiler bugs
981
1154
  // see: https://github.com/ggml-org/llama.cpp/issues/12846
982
- vec_xst(y, 0, (float *)(arr));
1155
+ vec_xst(v_y, 0, (float *)(arr));
983
1156
 
984
1157
  for (int i = 0; i < 4; i++) {
985
- x[i] = WSP_GGML_FP32_TO_FP16(arr[i]);
1158
+ x[i] = WSP_GGML_CPU_FP32_TO_FP16(arr[i]);
986
1159
  }
1160
+ #endif
987
1161
  }
988
1162
 
989
1163
  #define WSP_GGML_F16_VEC WSP_GGML_F32x4
@@ -1004,3 +1178,7 @@ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, __vector float y) {
1004
1178
  #define WSP_GGML_F32_ARR (WSP_GGML_F32_STEP/WSP_GGML_F32_EPR)
1005
1179
  #define WSP_GGML_F16_ARR (WSP_GGML_F16_STEP/WSP_GGML_F16_EPR)
1006
1180
  #endif
1181
+
1182
+ #ifdef __cplusplus
1183
+ }
1184
+ #endif
@@ -219,11 +219,11 @@ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggm
219
219
 
220
220
  // leftovers
221
221
  for (int i = np; i < n; ++i) {
222
- sumf += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[i])*WSP_GGML_FP16_TO_FP32(y[i]));
222
+ sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
223
223
  }
224
224
  #else
225
225
  for (int i = 0; i < n; ++i) {
226
- sumf += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[i])*WSP_GGML_FP16_TO_FP32(y[i]));
226
+ sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
227
227
  }
228
228
  #endif
229
229
 
@@ -254,6 +254,30 @@ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) {
254
254
  }
255
255
  }
256
256
 
257
+ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
258
+ int i = 0;
259
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
260
+ for (; i + 15 < n; i += 16) {
261
+ _mm512_storeu_ps(y + i, _mm512_mul_ps(wsp_ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
262
+ }
263
+ #elif defined(__AVX2__) && defined(__FMA__)
264
+ for (; i + 7 < n; i += 8) {
265
+ _mm256_storeu_ps(y + i, _mm256_mul_ps(wsp_ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
266
+ }
267
+ #elif defined(__SSE2__)
268
+ for (; i + 3 < n; i += 4) {
269
+ _mm_storeu_ps(y + i, _mm_mul_ps(wsp_ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
270
+ }
271
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
272
+ for (; i + 3 < n; i += 4) {
273
+ vst1q_f32(y + i, vmulq_f32(wsp_ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
274
+ }
275
+ #endif
276
+ for (; i < n; ++i) {
277
+ y[i] = wsp_ggml_silu_f32(x[i]) * g[i];
278
+ }
279
+ }
280
+
257
281
  wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
258
282
  int i = 0;
259
283
  wsp_ggml_float sum = 0;