whisper.rn 0.5.0 → 0.5.1

This diff reflects the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (99)
  1. package/android/build.gradle +2 -1
  2. package/android/gradle.properties +1 -1
  3. package/cpp/ggml-alloc.c +264 -126
  4. package/cpp/ggml-backend-impl.h +4 -1
  5. package/cpp/ggml-backend-reg.cpp +13 -5
  6. package/cpp/ggml-backend.cpp +207 -17
  7. package/cpp/ggml-backend.h +17 -1
  8. package/cpp/ggml-cpu/amx/amx.cpp +4 -2
  9. package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
  10. package/cpp/ggml-cpu/arch-fallback.h +0 -4
  11. package/cpp/ggml-cpu/common.h +14 -0
  12. package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
  13. package/cpp/ggml-cpu/ggml-cpu.c +48 -41
  14. package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
  15. package/cpp/ggml-cpu/ops.cpp +518 -767
  16. package/cpp/ggml-cpu/ops.h +2 -0
  17. package/cpp/ggml-cpu/simd-mappings.h +88 -59
  18. package/cpp/ggml-cpu/vec.cpp +161 -20
  19. package/cpp/ggml-cpu/vec.h +400 -51
  20. package/cpp/ggml-cpu.h +1 -1
  21. package/cpp/ggml-impl.h +43 -10
  22. package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
  23. package/cpp/ggml-metal/ggml-metal-common.h +52 -0
  24. package/cpp/ggml-metal/ggml-metal-context.h +33 -0
  25. package/cpp/ggml-metal/ggml-metal-context.m +600 -0
  26. package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
  27. package/cpp/ggml-metal/ggml-metal-device.h +226 -0
  28. package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
  29. package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
  30. package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
  31. package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
  32. package/cpp/ggml-metal/ggml-metal.cpp +718 -0
  33. package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
  34. package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
  35. package/cpp/ggml-metal-impl.h +40 -40
  36. package/cpp/ggml-metal.h +1 -6
  37. package/cpp/ggml-quants.c +1 -0
  38. package/cpp/ggml.c +175 -13
  39. package/cpp/ggml.h +84 -5
  40. package/cpp/jsi/RNWhisperJSI.cpp +2 -0
  41. package/cpp/jsi/ThreadPool.h +3 -3
  42. package/cpp/whisper.cpp +85 -70
  43. package/cpp/whisper.h +1 -0
  44. package/ios/CMakeLists.txt +6 -1
  45. package/ios/RNWhisperVadContext.mm +14 -13
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  48. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  49. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  50. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  51. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  53. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  54. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  56. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  57. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  58. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  59. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  60. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  61. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  62. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  63. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  64. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  65. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  66. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  67. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  68. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  70. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  71. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  72. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  73. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  74. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  75. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
  76. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
  77. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  78. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  79. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  80. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
  81. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
  82. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
  83. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
  84. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
  85. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
  86. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
  87. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
  88. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  89. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
  90. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  91. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  92. package/lib/commonjs/version.json +1 -1
  93. package/lib/module/version.json +1 -1
  94. package/package.json +1 -1
  95. package/src/version.json +1 -1
  96. package/whisper-rn.podspec +8 -9
  97. package/cpp/ggml-metal.m +0 -6779
  98. package/cpp/ggml-whisper-sim.metallib +0 -0
  99. package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-backend.h:

@@ -132,6 +132,8 @@ extern "C" {
         WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
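
Downstream code that branches on the device type now has a third GPU-like case to handle. A minimal sketch, where dev_type and use_gpu_offload are illustrative locals rather than part of the API:

    switch (dev_type) {
        case WSP_GGML_BACKEND_DEVICE_TYPE_GPU:
        case WSP_GGML_BACKEND_DEVICE_TYPE_IGPU: // new in 0.5.1: GPU backed by host memory
            use_gpu_offload = true;
            break;
        default: // CPU and ACCEL devices
            use_gpu_offload = false;
            break;
    }
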
@@ -150,11 +152,21 @@ extern "C" {

     // all the device properties
     struct wsp_ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum wsp_ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct wsp_ggml_backend_dev_caps caps;
     };

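A sketch of reading the extended properties, assuming wsp_ggml_backend_dev_get_props is the existing accessor that fills this struct:

    struct wsp_ggml_backend_dev_props props;
    wsp_ggml_backend_dev_get_props(dev, &props);
    printf("%s (%s): %zu of %zu bytes free, id=%s\n",
           props.name, props.description,
           props.memory_free, props.memory_total,
           props.device_id ? props.device_id : "unknown"); // device_id may be NULL
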
@@ -302,11 +314,15 @@ extern "C" {
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);

-    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);

     WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
     WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);

+    // Split graph without allocating it
+    WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
     WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
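
The new entry point lets a caller inspect how the scheduler would partition a graph before committing any allocations. A sketch, assuming the split counter above is populated by the split step:

    wsp_ggml_backend_sched_split_graph(sched, graph);          // split only, no allocation
    int n_splits = wsp_ggml_backend_sched_get_n_splits(sched); // inspect the partitioning
    fprintf(stderr, "graph would run in %d split(s)\n", n_splits);
    if (wsp_ggml_backend_sched_alloc_graph(sched, graph)) {    // now allocate ...
        wsp_ggml_backend_sched_graph_compute(sched, graph);    // ... and compute
    }
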
package/cpp/ggml-cpu/amx/amx.cpp:

@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"

-#if defined(__gnu_linux__)
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
@@ -186,7 +186,7 @@ static size_t wsp_ggml_backend_amx_buffer_type_get_alloc_size(wsp_ggml_backend_b
 #define XFEATURE_XTILEDATA 18

 static bool wsp_ggml_amx_init() {
-#if defined(__gnu_linux__)
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
@@ -194,6 +194,8 @@ static bool wsp_ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }

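Taken together, the three amx.cpp hunks leave wsp_ggml_amx_init() looking roughly like this (reconstructed from the diff context). __linux__ also covers non-glibc Linux such as musl or bionic, which __gnu_linux__ did not, and the new #else branch gives the function a defined result on every other platform:

    static bool wsp_ggml_amx_init() {
    #if defined(__linux__)
        // request permission to use AMX tile data from the kernel
        if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
            fprintf(stderr, "AMX is not ready to be used!\n");
            return false;
        }
        return true;
    #elif defined(_WIN32)
        return true;
    #else
        return false;
    #endif
    }
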
package/cpp/ggml-cpu/arch/x86/repack.cpp:

@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
     const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));

-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
     const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));

-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
package/cpp/ggml-cpu/arch-fallback.h:

@@ -73,7 +73,6 @@
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -151,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
-#define wsp_ggml_vec_dot_q5_0_q8_0_generic wsp_ggml_vec_dot_q5_0_q8_0
-#define wsp_ggml_vec_dot_q5_1_q8_1_generic wsp_ggml_vec_dot_q5_1_q8_1
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_q2_K_q8_K_generic wsp_ggml_vec_dot_q2_K_q8_K
@@ -163,7 +160,6 @@
 #define wsp_ggml_vec_dot_iq3_s_q8_K_generic wsp_ggml_vec_dot_iq3_s_q8_K
 #define wsp_ggml_vec_dot_iq1_s_q8_K_generic wsp_ggml_vec_dot_iq1_s_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
package/cpp/ggml-cpu/common.h:

@@ -28,6 +28,14 @@ static inline float bf16_to_f32(wsp_ggml_bf16_t x) {
     return WSP_GGML_BF16_TO_FP32(x);
 }

+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<wsp_ggml_bf16_t> {
     static constexpr wsp_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };

+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct wsp_ggml_compute_params * params, const struct wsp_ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
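
The specialization lets the templated CPU ops route int32_t through the same generic F32 conversion path as F16 and BF16. A minimal sketch of the pattern (the helper name is illustrative):

    template <typename T>
    static inline float load_as_f32(const T * p) {
        return type_conversion_table<T>::to_f32(*p);
    }
    // load_as_f32<int32_t>(p) now compiles thanks to the new specialization.
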
package/cpp/ggml-cpu/ggml-cpu-impl.h:

@@ -68,12 +68,6 @@ struct wsp_ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__

-#if defined(__s390x__) && defined(WSP_GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && WSP_GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }

+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum_f32x4(float32x4_t v) {
+    float32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t wsp_ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
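
A usage sketch for the new s390x horizontal-sum helpers: they collapse a vector accumulator, such as one built with wsp_ggml_vec_dot() above, into a scalar (acc and facc are illustrative accumulators):

    int32_t sum  = vec_hsum_i32x4(acc);  // replaces an open-coded reduction
    float   fsum = vec_hsum_f32x4(facc); // same pattern for float accumulators
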
package/cpp/ggml-cpu/ggml-cpu.c:

@@ -373,6 +373,9 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
         .vec_dot_type = WSP_GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [WSP_GGML_TYPE_I32] = {
+        .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_i32,
+    },
 };

 const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_ggml_type type) {
@@ -470,10 +473,10 @@ struct wsp_ggml_threadpool {
 struct wsp_ggml_compute_state {
 #ifndef WSP_GGML_USE_OPENMP
     wsp_ggml_thread_t thrd;
-    bool cpumask[WSP_GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[WSP_GGML_MAX_N_THREADS];
     struct wsp_ggml_threadpool * threadpool;
     int ith;
 };
@@ -1876,10 +1879,18 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case WSP_GGML_OP_IM2COL_3D:
+            {
+                wsp_ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D:
             {
                 wsp_ggml_compute_forward_conv_2d(params, tensor);
             } break;
+        case WSP_GGML_OP_CONV_3D:
+            {
+                wsp_ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D_DW:
             {
                 wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2251,7 +2262,9 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         case WSP_GGML_OP_IM2COL:
         case WSP_GGML_OP_IM2COL_BACK:
+        case WSP_GGML_OP_IM2COL_3D:
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
         case WSP_GGML_OP_CONV_2D_DW:
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
         case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2686,7 +2699,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 if (wsp_ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
-                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
+                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == WSP_GGML_TYPE_F32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_I32) ||
+                    (node->src[0]->type == WSP_GGML_TYPE_I32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F32)) {
                     cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
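
This mirrors the quantized and F16/BF16 cases: an F32 <-> I32 copy is routed through an intermediate F32 buffer, and the planner now reserves work memory for it. A sketch of a cast that is planned correctly after this change (tensor construction uses the usual wsp_ggml API, assumed unchanged):

    struct wsp_ggml_tensor * src  = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1024);
    struct wsp_ggml_tensor * dst  = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 1024);
    struct wsp_ggml_tensor * cast = wsp_ggml_cpy(ctx, src, dst); // F32 -> I32 copy
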
@@ -2773,6 +2789,7 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 }
             } break;
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
             {
                 cur = WSP_GGML_IM2COL_WORK_SIZE;
             } break;
@@ -3064,7 +3081,14 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(

     threadpool->workers = workers;

-#ifndef WSP_GGML_USE_OPENMP
+#ifdef WSP_GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        wsp_ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // WSP_GGML_USE_OPENMP
     wsp_ggml_mutex_init(&threadpool->mutex);
     wsp_ggml_cond_init(&threadpool->cond);

@@ -3137,7 +3161,14 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }

-            wsp_ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            wsp_ggml_thread_apply_priority(threadpool->prio);
+            if (wsp_ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                wsp_ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            wsp_ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
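
With these two hunks, CPU masks are computed and applied in OpenMP builds as well, so thread pinning behaves the same as with the internal thread backend. A sketch of opting in, assuming the usual wsp_ggml_threadpool_params setup helper:

    struct wsp_ggml_threadpool_params tpp = wsp_ggml_threadpool_params_default(4);
    tpp.cpumask[0] = tpp.cpumask[2] = tpp.cpumask[4] = tpp.cpumask[6] = true; // pin to even cores
    tpp.strict_cpu = true; // one fixed core per worker
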
@@ -3200,20 +3231,12 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
     }
 #endif
     for (; i < n; ++i) {
@@ -3241,21 +3264,6 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif

     for (; i < n; ++i) {
@@ -3270,6 +3278,13 @@ void wsp_ggml_cpu_fp32_to_bf16(const float * x, wsp_ggml_bf16_t * y, int64_t n)
     }
 }

+void wsp_ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void wsp_ggml_cpu_bf16_to_fp32(const wsp_ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
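
The new scalar conversion relies on C's float-to-int conversion, which truncates toward zero. For example:

    float   x[3] = { 1.9f, -2.5f, 7.0f };
    int32_t y[3];
    wsp_ggml_cpu_fp32_to_i32(x, y, 3);
    // y is now { 1, -2, 7 }
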
@@ -3459,14 +3474,6 @@ int wsp_ggml_cpu_has_vxe(void) {
 #endif
 }

-int wsp_ggml_cpu_has_nnpa(void) {
-#if defined(WSP_GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int wsp_ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
package/cpp/ggml-cpu/ggml-cpu.cpp:

@@ -18,6 +18,10 @@
 # include "kleidiai/kleidiai.h"
 #endif

+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+# include "spacemit/ime.h"
+#endif
+
 #if defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buf
     }
 #endif

+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+    if (wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+        bufts.push_back(wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type());
+    }
+#endif
+
 #ifdef WSP_GGML_USE_CPU_KLEIDIAI
     if (wsp_ggml_backend_cpu_kleidiai_buffer_type()) {
         bufts.push_back(wsp_ggml_backend_cpu_kleidiai_buffer_type());
@@ -190,6 +200,7 @@ static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
     /* .graph_compute = */ wsp_ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
+    /* .graph_optimize = */ NULL,
 };

 static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
@@ -348,8 +359,10 @@ static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, s
     long pages = sysconf(_SC_PHYS_PAGES);
     long page_size = sysconf(_SC_PAGE_SIZE);
     *total = pages * page_size;
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
     *free = *total;
-#endif
+#endif // _WIN32

     WSP_GGML_UNUSED(dev);
 }
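
Callers querying memory through the device API therefore see free == total on non-Windows hosts. A sketch, assuming wsp_ggml_backend_dev_memory is the public wrapper over this function:

    size_t free_mem, total_mem;
    wsp_ggml_backend_dev_memory(dev, &free_mem, &total_mem);
    // on Linux/macOS both values report total physical memory
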
@@ -576,9 +589,6 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
     if (wsp_ggml_cpu_has_vxe()) {
         features.push_back({ "VXE", "1" });
     }
-    if (wsp_ggml_cpu_has_nnpa()) {
-        features.push_back({ "NNPA", "1" });
-    }
     if (wsp_ggml_cpu_has_wasm_simd()) {
         features.push_back({ "WASM_SIMD", "1" });
     }