whisper.rn 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/android/src/main/java/com/rnwhisper/RNWhisper.java +24 -18
  2. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +1 -57
  3. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  9. package/cpp/ggml-backend.cpp +36 -18
  10. package/cpp/ggml-backend.h +1 -1
  11. package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
  12. package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
  13. package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
  14. package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
  15. package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
  16. package/cpp/ggml-cpu/common.h +3 -2
  17. package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
  18. package/cpp/ggml-cpu/ggml-cpu.c +95 -17
  19. package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
  20. package/cpp/ggml-cpu/ops.cpp +775 -74
  21. package/cpp/ggml-cpu/ops.h +7 -0
  22. package/cpp/ggml-cpu/quants.c +25 -24
  23. package/cpp/ggml-cpu/repack.cpp +15 -14
  24. package/cpp/ggml-cpu/simd-mappings.h +211 -33
  25. package/cpp/ggml-cpu/vec.cpp +26 -2
  26. package/cpp/ggml-cpu/vec.h +99 -45
  27. package/cpp/ggml-cpu.h +2 -0
  28. package/cpp/ggml-impl.h +125 -183
  29. package/cpp/ggml-metal-impl.h +27 -0
  30. package/cpp/ggml-metal.m +298 -41
  31. package/cpp/ggml-quants.c +6 -6
  32. package/cpp/ggml-whisper-sim.metallib +0 -0
  33. package/cpp/ggml-whisper.metallib +0 -0
  34. package/cpp/ggml.c +269 -40
  35. package/cpp/ggml.h +122 -2
  36. package/cpp/gguf.cpp +5 -1
  37. package/cpp/whisper.cpp +4 -0
  38. package/cpp/whisper.h +2 -0
  39. package/ios/RNWhisper.mm +35 -38
  40. package/ios/RNWhisperVadContext.h +1 -1
  41. package/ios/RNWhisperVadContext.mm +2 -6
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
  64. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
  72. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  73. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  74. package/package.json +1 -1
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(wsp_ggml_fp16_t *x, wsp_ggml_fp16_t
39
40
  float tmp[16];
40
41
 
41
42
  for (int i = 0; i < 8; i++) {
42
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
43
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
43
44
  }
44
45
 
45
46
  for (int i = 0; i < 8; i++) {
46
- tmp[i + 8] = WSP_GGML_FP16_TO_FP32(y[i]);
47
+ tmp[i + 8] = WSP_GGML_CPU_FP16_TO_FP32(y[i]);
47
48
  }
48
49
 
49
50
  return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
54
55
  _mm_storeu_si128((__m128i*)tmphalf, x);
55
56
 
56
57
  for (int i = 0; i < 4; i++) {
57
- tmp[i] = WSP_GGML_FP16_TO_FP32(tmphalf[i]);
58
- tmp[i + 4] = WSP_GGML_FP16_TO_FP32(tmphalf[i]);
59
- tmp[i + 8] = WSP_GGML_FP16_TO_FP32(tmphalf[i]);
60
- tmp[i + 12] = WSP_GGML_FP16_TO_FP32(tmphalf[i]);
58
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
59
+ tmp[i + 4] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
60
+ tmp[i + 8] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61
+ tmp[i + 12] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61
62
  }
62
63
 
63
64
  return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(wsp_ggml_fp16_t *x) {
67
68
  float tmp[8];
68
69
 
69
70
  for (int i = 0; i < 8; i++) {
70
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
71
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
71
72
  }
72
73
 
73
74
  return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(wsp_ggml_fp16_t *x) {
76
77
  float tmp[8];
77
78
 
78
79
  for (int i = 0; i < 4; i++) {
79
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
80
- tmp[i + 4] = WSP_GGML_FP16_TO_FP32(x[i]);
80
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
81
+ tmp[i + 4] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
81
82
  }
82
83
 
83
84
  return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(wsp_ggml_fp16_t *x, __m128i ar
88
89
 
89
90
  _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
90
91
  for (int i = 0; i < 8; i++) {
91
- tmp[i] = WSP_GGML_FP16_TO_FP32(tmphalf[i]);
92
+ tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
92
93
  }
93
94
 
94
95
  return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x8(const float * WSP_GGML_RESTRICT x, void
211
212
  id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
212
213
 
213
214
  // Store the scale for the individual block
214
- y[i].d[row_iter] = WSP_GGML_FP32_TO_FP16(d);
215
+ y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);
215
216
 
216
217
  // Store the values in blocks of eight values - Aim is to use these later for block interleaving
217
218
  srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x8(const float * WSP_GGML_RESTRICT x, void
297
298
  const float d = amax / ((1 << 7) - 1);
298
299
  id[row_iter] = d ? 1.0f / d : 0.0f;
299
300
 
300
- y[i].d[row_iter] = WSP_GGML_FP32_TO_FP16(d);
301
+ y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);
301
302
  }
302
303
 
303
304
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -647,7 +648,7 @@ void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
647
648
  const __m256 col_scale_f32 = WSP_GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
648
649
 
649
650
  // Load and convert to FP32 scale from block_q8_0
650
- const __m256 row_scale_f32 = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(a_ptr[b].d));
651
+ const __m256 row_scale_f32 = _mm256_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
651
652
 
652
653
  // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
653
654
  __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
706
707
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
707
708
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
708
709
  }
709
- sumf[j] += sumi * WSP_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_FP16_TO_FP32(a_ptr[l].d);
710
+ sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
710
711
  }
711
712
  }
712
713
  }
@@ -972,13 +973,13 @@ void wsp_ggml_gemv_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
972
973
  sumi2 = sumi2 * scales_1[j];
973
974
  sumi += sumi1 + sumi2;
974
975
  }
975
- sumf[j] += sumi * WSP_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
976
+ sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
976
977
  }
977
978
  }
978
979
  for (int sb = 0; sb < 8; sb++) {
979
980
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
980
981
  for (int j = 0; j < ncols_interleaved; j++) {
981
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * WSP_GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
982
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
982
983
  }
983
984
  }
984
985
  }
@@ -1755,7 +1756,7 @@ void wsp_ggml_gemm_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
1755
1756
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1756
1757
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1757
1758
  }
1758
- sumf[m][j] += sumi * WSP_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1759
+ sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1759
1760
  }
1760
1761
  }
1761
1762
  }
@@ -3259,7 +3260,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3259
3260
  sumi2 = sumi2 * scales_1[j];
3260
3261
  sumi += sumi1 + sumi2;
3261
3262
  }
3262
- sumf[m][j] += sumi * WSP_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
3263
+ sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
3263
3264
  }
3264
3265
  }
3265
3266
  }
@@ -3268,7 +3269,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
3268
3269
  for(int m = 0; m < 4; m++) {
3269
3270
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
3270
3271
  for(int j = 0; j < ncols_interleaved; j++) {
3271
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * WSP_GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
3272
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
3272
3273
  }
3273
3274
  }
3274
3275
  }
@@ -4,6 +4,7 @@
4
4
  #include "traits.h"
5
5
  #include "ggml-cpu-impl.h"
6
6
  #include "ggml-impl.h"
7
+ #include "simd-mappings.h"
7
8
 
8
9
  #ifdef __cplusplus
9
10
 
@@ -12,11 +13,11 @@
12
13
  // convenience functions/macros for use in template calls
13
14
  // note: these won't be required after the 'traits' lookup table is used.
14
15
  static inline wsp_ggml_fp16_t f32_to_f16(float x) {
15
- return WSP_GGML_FP32_TO_FP16(x);
16
+ return WSP_GGML_CPU_FP32_TO_FP16(x);
16
17
  }
17
18
 
18
19
  static inline float f16_to_f32(wsp_ggml_fp16_t x) {
19
- return WSP_GGML_FP16_TO_FP32(x);
20
+ return WSP_GGML_CPU_FP16_TO_FP32(x);
20
21
  }
21
22
 
22
23
  static inline wsp_ggml_bf16_t f32_to_bf16(float x) {
@@ -62,11 +62,17 @@ struct wsp_ggml_compute_params {
62
62
  #if defined(__s390x__) && defined(__VEC__)
63
63
  #ifndef __VXE__
64
64
  #define __VXE__
65
- #endif
65
+ #endif // __VXE__
66
66
  #ifndef __VXE2__
67
67
  #define __VXE2__
68
- #endif
69
- #endif
68
+ #endif // __VXE2__
69
+ #endif // __s390x__ && __VEC__
70
+
71
+ #if defined(__s390x__) && defined(WSP_GGML_NNPA)
72
+ #ifndef __NNPA__
73
+ #define __NNPA__
74
+ #endif // __NNPA__
75
+ #endif // __s390x__ && WSP_GGML_NNPA
70
76
 
71
77
  #if defined(__ARM_FEATURE_SVE)
72
78
  #include <sys/prctl.h>
@@ -72,6 +72,9 @@
72
72
  #define UNUSED WSP_GGML_UNUSED
73
73
  #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
74
74
 
75
+ // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
76
+ float wsp_ggml_table_f32_f16[1 << 16];
77
+
75
78
  #if defined(__ARM_ARCH)
76
79
  struct wsp_ggml_arm_arch_features_type {
77
80
  int sve_cnt;
@@ -192,6 +195,7 @@ typedef pthread_t wsp_ggml_thread_t;
192
195
 
193
196
  static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT] = {
194
197
  [WSP_GGML_TYPE_F32] = {
198
+ .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_fp32,
195
199
  .vec_dot = (wsp_ggml_vec_dot_t) wsp_ggml_vec_dot_f32,
196
200
  .vec_dot_type = WSP_GGML_TYPE_F32,
197
201
  .nrows = 1,
@@ -736,7 +740,7 @@ struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int3
736
740
  {
737
741
  assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
738
742
  for (int i = 0; i < n; i++) {
739
- wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_FP32_TO_FP16(value));
743
+ wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_CPU_FP32_TO_FP16(value));
740
744
  }
741
745
  } break;
742
746
  case WSP_GGML_TYPE_BF16:
@@ -795,7 +799,7 @@ struct wsp_ggml_tensor * wsp_ggml_set_f32(struct wsp_ggml_tensor * tensor, float
795
799
  {
796
800
  assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
797
801
  for (int i = 0; i < n; i++) {
798
- wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_FP32_TO_FP16(value));
802
+ wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_CPU_FP32_TO_FP16(value));
799
803
  }
800
804
  } break;
801
805
  case WSP_GGML_TYPE_BF16:
@@ -846,7 +850,7 @@ int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i) {
846
850
  case WSP_GGML_TYPE_F16:
847
851
  {
848
852
  WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
849
- return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
853
+ return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
850
854
  }
851
855
  case WSP_GGML_TYPE_BF16:
852
856
  {
@@ -891,7 +895,7 @@ void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t v
891
895
  case WSP_GGML_TYPE_F16:
892
896
  {
893
897
  WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
894
- ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_FP32_TO_FP16(value);
898
+ ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_CPU_FP32_TO_FP16(value);
895
899
  } break;
896
900
  case WSP_GGML_TYPE_BF16:
897
901
  {
@@ -920,7 +924,7 @@ int32_t wsp_ggml_get_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i
920
924
  case WSP_GGML_TYPE_I32:
921
925
  return ((int32_t *) data)[0];
922
926
  case WSP_GGML_TYPE_F16:
923
- return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
927
+ return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
924
928
  case WSP_GGML_TYPE_BF16:
925
929
  return WSP_GGML_BF16_TO_FP32(((wsp_ggml_bf16_t *) data)[0]);
926
930
  case WSP_GGML_TYPE_F32:
@@ -947,7 +951,7 @@ void wsp_ggml_set_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
947
951
  } break;
948
952
  case WSP_GGML_TYPE_F16:
949
953
  {
950
- ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_FP32_TO_FP16(value);
954
+ ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_CPU_FP32_TO_FP16(value);
951
955
  } break;
952
956
  case WSP_GGML_TYPE_BF16:
953
957
  {
@@ -985,7 +989,7 @@ float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i) {
985
989
  }
986
990
  case WSP_GGML_TYPE_F16:
987
991
  {
988
- return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
992
+ return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
989
993
  }
990
994
  case WSP_GGML_TYPE_BF16:
991
995
  {
@@ -1024,7 +1028,7 @@ void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float val
1024
1028
  } break;
1025
1029
  case WSP_GGML_TYPE_F16:
1026
1030
  {
1027
- ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_FP32_TO_FP16(value);
1031
+ ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_CPU_FP32_TO_FP16(value);
1028
1032
  } break;
1029
1033
  case WSP_GGML_TYPE_BF16:
1030
1034
  {
@@ -1051,7 +1055,7 @@ float wsp_ggml_get_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
1051
1055
  case WSP_GGML_TYPE_I32:
1052
1056
  return ((int32_t *) data)[0];
1053
1057
  case WSP_GGML_TYPE_F16:
1054
- return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
1058
+ return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
1055
1059
  case WSP_GGML_TYPE_BF16:
1056
1060
  return WSP_GGML_BF16_TO_FP32(((wsp_ggml_bf16_t *) data)[0]);
1057
1061
  case WSP_GGML_TYPE_F32:
@@ -1078,7 +1082,7 @@ void wsp_ggml_set_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
1078
1082
  } break;
1079
1083
  case WSP_GGML_TYPE_F16:
1080
1084
  {
1081
- ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_FP32_TO_FP16(value);
1085
+ ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_CPU_FP32_TO_FP16(value);
1082
1086
  } break;
1083
1087
  case WSP_GGML_TYPE_BF16:
1084
1088
  {
@@ -1189,7 +1193,7 @@ static void wsp_ggml_compute_forward_mul_mat_one_chunk(
1189
1193
  }
1190
1194
  }
1191
1195
 
1192
- static void wsp_ggml_compute_forward_mul_mat(
1196
+ void wsp_ggml_compute_forward_mul_mat(
1193
1197
  const struct wsp_ggml_compute_params * params,
1194
1198
  struct wsp_ggml_tensor * dst) {
1195
1199
 
@@ -1814,6 +1818,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1814
1818
  {
1815
1819
  wsp_ggml_compute_forward_get_rows_back(params, tensor);
1816
1820
  } break;
1821
+ case WSP_GGML_OP_SET_ROWS:
1822
+ {
1823
+ wsp_ggml_compute_forward_set_rows(params, tensor);
1824
+ } break;
1817
1825
  case WSP_GGML_OP_DIAG:
1818
1826
  {
1819
1827
  wsp_ggml_compute_forward_diag(params, tensor);
@@ -1858,6 +1866,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1858
1866
  {
1859
1867
  wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
1860
1868
  } break;
1869
+ case WSP_GGML_OP_CONV_2D:
1870
+ {
1871
+ wsp_ggml_compute_forward_conv_2d(params, tensor);
1872
+ } break;
1861
1873
  case WSP_GGML_OP_CONV_2D_DW:
1862
1874
  {
1863
1875
  wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -1941,6 +1953,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
1941
1953
  {
1942
1954
  wsp_ggml_compute_forward_unary(params, tensor);
1943
1955
  } break;
1956
+ case WSP_GGML_OP_GLU:
1957
+ {
1958
+ wsp_ggml_compute_forward_glu(params, tensor);
1959
+ } break;
1944
1960
  case WSP_GGML_OP_GET_REL_POS:
1945
1961
  {
1946
1962
  wsp_ggml_compute_forward_get_rel_pos(params, tensor);
@@ -2151,6 +2167,18 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2151
2167
  WSP_GGML_ABORT("fatal error");
2152
2168
  }
2153
2169
  break;
2170
+ case WSP_GGML_OP_GLU:
2171
+ switch (wsp_ggml_get_glu_op(node)) {
2172
+ case WSP_GGML_GLU_OP_REGLU:
2173
+ case WSP_GGML_GLU_OP_GEGLU:
2174
+ case WSP_GGML_GLU_OP_SWIGLU:
2175
+ {
2176
+ n_tasks = n_threads;
2177
+ } break;
2178
+ default:
2179
+ WSP_GGML_ABORT("fatal error");
2180
+ }
2181
+ break;
2154
2182
  case WSP_GGML_OP_SILU_BACK:
2155
2183
  case WSP_GGML_OP_MUL:
2156
2184
  case WSP_GGML_OP_DIV:
@@ -2167,6 +2195,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2167
2195
  n_tasks = n_threads;
2168
2196
  } break;
2169
2197
  case WSP_GGML_OP_GET_ROWS:
2198
+ case WSP_GGML_OP_SET_ROWS:
2170
2199
  {
2171
2200
  // FIXME: get_rows can use additional threads, but the cost of launching additional threads
2172
2201
  // decreases performance with GPU offloading
@@ -2203,6 +2232,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
2203
2232
  } break;
2204
2233
  case WSP_GGML_OP_IM2COL:
2205
2234
  case WSP_GGML_OP_IM2COL_BACK:
2235
+ case WSP_GGML_OP_CONV_2D:
2206
2236
  case WSP_GGML_OP_CONV_2D_DW:
2207
2237
  case WSP_GGML_OP_CONV_TRANSPOSE_1D:
2208
2238
  case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2721,6 +2751,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
2721
2751
  WSP_GGML_ABORT("fatal error");
2722
2752
  }
2723
2753
  } break;
2754
+ case WSP_GGML_OP_CONV_2D:
2755
+ {
2756
+ cur = WSP_GGML_IM2COL_WORK_SIZE;
2757
+ } break;
2724
2758
  case WSP_GGML_OP_CONV_TRANSPOSE_2D:
2725
2759
  {
2726
2760
  const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -3121,6 +3155,10 @@ enum wsp_ggml_status wsp_ggml_graph_compute_with_ctx(struct wsp_ggml_context * c
3121
3155
  return wsp_ggml_graph_compute(cgraph, &cplan);
3122
3156
  }
3123
3157
 
3158
+ void wsp_ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
3159
+ memcpy(y, x, n * sizeof(float));
3160
+ }
3161
+
3124
3162
  void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n) {
3125
3163
  int64_t i = 0;
3126
3164
  #if defined(__F16C__)
@@ -3141,9 +3179,24 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
3141
3179
  __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3142
3180
  _mm_storel_epi64((__m128i *)(y + i), y_vec);
3143
3181
  }
3182
+ #elif defined(__NNPA__)
3183
+ for (; i + 7 < n; i += 8) {
3184
+ float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
3185
+ float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
3186
+ uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
3187
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3188
+ vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
3189
+ }
3190
+ for (; i + 3 < n; i += 4) {
3191
+ float32x4_t v_x = vec_xl(0, (const float *)(x + i));
3192
+ float32x4_t v_zero = vec_splats(0.0f);
3193
+ uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
3194
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
3195
+ vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
3196
+ }
3144
3197
  #endif
3145
3198
  for (; i < n; ++i) {
3146
- y[i] = WSP_GGML_FP32_TO_FP16(x[i]);
3199
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(x[i]);
3147
3200
  }
3148
3201
  }
3149
3202
 
@@ -3167,9 +3220,25 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
3167
3220
  __m128 y_vec = _mm_cvtph_ps(x_vec);
3168
3221
  _mm_storeu_ps(y + i, y_vec);
3169
3222
  }
3223
+ #elif defined(__NNPA__)
3224
+ for (; i + 7 < n; i += 8) {
3225
+ uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
3226
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3227
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3228
+ float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
3229
+ vec_xst(v_yh, 0, (float *)(y + i + 0));
3230
+ vec_xst(v_yl, 0, (float *)(y + i + 4));
3231
+ }
3232
+ for (; i + 3 < n; i += 4) {
3233
+ uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
3234
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
3235
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
3236
+ vec_xst(v_yh, 0, (float *)(y + i));
3237
+ }
3170
3238
  #endif
3239
+
3171
3240
  for (; i < n; ++i) {
3172
- y[i] = WSP_GGML_FP16_TO_FP32(x[i]);
3241
+ y[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
3173
3242
  }
3174
3243
  }
3175
3244
 
@@ -3369,6 +3438,14 @@ int wsp_ggml_cpu_has_vxe(void) {
3369
3438
  #endif
3370
3439
  }
3371
3440
 
3441
+ int wsp_ggml_cpu_has_nnpa(void) {
3442
+ #if defined(WSP_GGML_NNPA)
3443
+ return 1;
3444
+ #else
3445
+ return 0;
3446
+ #endif
3447
+ }
3448
+
3372
3449
  int wsp_ggml_cpu_has_neon(void) {
3373
3450
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
3374
3451
  return 1;
@@ -3418,7 +3495,7 @@ int wsp_ggml_cpu_has_sme(void) {
3418
3495
  }
3419
3496
 
3420
3497
  void wsp_ggml_cpu_init(void) {
3421
- // needed to initialize f16 tables
3498
+ // needed to initialize wsp_ggml_time
3422
3499
  {
3423
3500
  struct wsp_ggml_init_params params = { 0, NULL, false };
3424
3501
  struct wsp_ggml_context * ctx = wsp_ggml_init(params);
@@ -3439,9 +3516,10 @@ void wsp_ggml_cpu_init(void) {
3439
3516
  uint16_t u16;
3440
3517
  wsp_ggml_fp16_t fp16;
3441
3518
  } u = {i};
3442
- float f = WSP_GGML_FP16_TO_FP32(u.fp16);
3443
- wsp_ggml_table_gelu_f16[i] = WSP_GGML_FP32_TO_FP16(wsp_ggml_gelu_f32(f));
3444
- wsp_ggml_table_gelu_quick_f16[i] = WSP_GGML_FP32_TO_FP16(wsp_ggml_gelu_quick_f32(f));
3519
+ float f = WSP_GGML_COMPUTE_FP16_TO_FP32(u.fp16);
3520
+ wsp_ggml_table_f32_f16[i] = f;
3521
+ wsp_ggml_table_gelu_f16[i] = WSP_GGML_CPU_FP32_TO_FP16(wsp_ggml_gelu_f32(f));
3522
+ wsp_ggml_table_gelu_quick_f16[i] = WSP_GGML_CPU_FP32_TO_FP16(wsp_ggml_gelu_quick_f32(f));
3445
3523
  }
3446
3524
 
3447
3525
  const uint64_t t_end = wsp_ggml_time_us(); UNUSED(t_end);
@@ -416,6 +416,7 @@ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev,
416
416
 
417
417
  switch (op->op) {
418
418
  case WSP_GGML_OP_CPY:
419
+ case WSP_GGML_OP_SET_ROWS:
419
420
  return
420
421
  op->type != WSP_GGML_TYPE_IQ3_XXS &&
421
422
  op->type != WSP_GGML_TYPE_IQ3_S &&
@@ -578,6 +579,9 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
578
579
  if (wsp_ggml_cpu_has_vxe()) {
579
580
  features.push_back({ "VXE", "1" });
580
581
  }
582
+ if (wsp_ggml_cpu_has_nnpa()) {
583
+ features.push_back({ "NNPA", "1" });
584
+ }
581
585
  if (wsp_ggml_cpu_has_wasm_simd()) {
582
586
  features.push_back({ "WASM_SIMD", "1" });
583
587
  }