whisper.rn 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +264 -126
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +48 -41
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +518 -767
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/vec.cpp +161 -20
- package/cpp/ggml-cpu/vec.h +400 -51
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +43 -10
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +175 -13
- package/cpp/ggml.h +84 -5
- package/cpp/jsi/RNWhisperJSI.cpp +2 -0
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +85 -70
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-backend.h
CHANGED
@@ -132,6 +132,8 @@ extern "C" {
         WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {

     // all the device properties
     struct wsp_ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum wsp_ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct wsp_ggml_backend_dev_caps caps;
     };

@@ -302,11 +314,15 @@ extern "C" {
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
     WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);

-    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
+    WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);

     WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
     WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);

+    // Split graph without allocating it
+    WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
     WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
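
The scheduler additions above expose per-backend buffer introspection and a split-only pass. A minimal caller-side sketch (variable names are hypothetical; assumes a scheduler, backend, and graph were created elsewhere, error handling omitted):

    // Inspect how the graph would be split across backends without allocating it.
    wsp_ggml_backend_sched_split_graph(sched, graph);
    printf("splits: %d\n", wsp_ggml_backend_sched_get_n_splits(sched));

    // After a real allocation, query the compute buffer used for a given backend.
    if (wsp_ggml_backend_sched_alloc_graph(sched, graph)) {
        wsp_ggml_backend_buffer_type_t buft = wsp_ggml_backend_sched_get_buffer_type(sched, backend);
        size_t sz = wsp_ggml_backend_sched_get_buffer_size(sched, backend);
        printf("%s: %zu bytes\n", wsp_ggml_backend_buft_name(buft), sz);
    }

    // The new device_id property can be read from any device's props.
    struct wsp_ggml_backend_dev_props props;
    wsp_ggml_backend_dev_get_props(wsp_ggml_backend_get_device(backend), &props);
    printf("device id: %s\n", props.device_id ? props.device_id : "(unknown)");
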
package/cpp/ggml-cpu/amx/amx.cpp
CHANGED
@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"

-#if defined(__gnu_linux__)
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
@@ -186,7 +186,7 @@ static size_t wsp_ggml_backend_amx_buffer_type_get_alloc_size(wsp_ggml_backend_b
 #define XFEATURE_XTILEDATA 18

 static bool wsp_ggml_amx_init() {
-#if defined(__gnu_linux__)
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
@@ -194,6 +194,8 @@ static bool wsp_ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
package/cpp/ggml-cpu/arch/x86/repack.cpp
CHANGED
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
     const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));

-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * WSP_GGML_RESTRICT s, siz
     const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
     const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));

-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
package/cpp/ggml-cpu/arch-fallback.h
CHANGED
@@ -73,7 +73,6 @@
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
@@ -151,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define wsp_quantize_row_q8_K_generic wsp_quantize_row_q8_K
-#define wsp_ggml_vec_dot_q5_0_q8_0_generic wsp_ggml_vec_dot_q5_0_q8_0
-#define wsp_ggml_vec_dot_q5_1_q8_1_generic wsp_ggml_vec_dot_q5_1_q8_1
 #define wsp_ggml_vec_dot_tq1_0_q8_K_generic wsp_ggml_vec_dot_tq1_0_q8_K
 #define wsp_ggml_vec_dot_tq2_0_q8_K_generic wsp_ggml_vec_dot_tq2_0_q8_K
 #define wsp_ggml_vec_dot_q2_K_q8_K_generic wsp_ggml_vec_dot_q2_K_q8_K
@@ -163,7 +160,6 @@
 #define wsp_ggml_vec_dot_iq3_s_q8_K_generic wsp_ggml_vec_dot_iq3_s_q8_K
 #define wsp_ggml_vec_dot_iq1_s_q8_K_generic wsp_ggml_vec_dot_iq1_s_q8_K
 #define wsp_ggml_vec_dot_iq1_m_q8_K_generic wsp_ggml_vec_dot_iq1_m_q8_K
-#define wsp_ggml_vec_dot_mxfp4_q8_0_generic wsp_ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic wsp_ggml_wsp_quantize_mat_q8_0_4x4
 #define wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic wsp_ggml_wsp_quantize_mat_q8_0_4x8
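
For context, arch-fallback.h maps each missing architecture-specific kernel onto the shared scalar implementation, so deleting a define means the target now ships a native kernel (here: mxfp4 generally, plus q5_0/q5_1 on s390x). A sketch of the aliasing trick with hypothetical names, not code from the package:

    /* The shared C file always defines foo_generic(). On targets without a
     * hand-written kernel, the header renames that definition to the public
     * symbol, so foo() resolves to the scalar reference implementation. */
    #define foo_generic foo

    void foo_generic(void) {
        /* scalar reference implementation; after preprocessing, this
         * translation unit actually defines foo() */
    }
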
package/cpp/ggml-cpu/common.h
CHANGED
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(wsp_ggml_bf16_t x) {
     return WSP_GGML_BF16_TO_FP32(x);
 }

+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<wsp_ggml_bf16_t> {
     static constexpr wsp_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };

+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct wsp_ggml_compute_params * params, const struct wsp_ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
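
The new i32 specialization lets templated CPU kernels route I32 tensors through the same float-based conversion machinery as F16 and BF16. A minimal sketch of the pattern (the function below is illustrative, not code from the package):

    template <typename src_t, typename dst_t>
    static void convert_row(const src_t * src, dst_t * dst, int64_t n) {
        for (int64_t i = 0; i < n; ++i) {
            // every element is routed through f32 via the per-type function pointers
            dst[i] = type_conversion_table<dst_t>::from_f32(
                         type_conversion_table<src_t>::to_f32(src[i]));
        }
    }
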
package/cpp/ggml-cpu/ggml-cpu-impl.h
CHANGED
@@ -68,12 +68,6 @@ struct wsp_ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__

-#if defined(__s390x__) && defined(WSP_GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && WSP_GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -486,6 +480,19 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }

+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum_f32x4(float32x4_t v) {
+    float32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t wsp_ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
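
The new vec_hsum helpers reduce a four-lane vector with one reversal and two adds: vec_reve reverses the lanes, so v + vec_reve(v) yields {v0+v3, v1+v2, v2+v1, v3+v0}, and lanes 0 and 1 then carry the full sum. A scalar model of the same reduction, for illustration only:

    // Scalar model of vec_hsum_f32x4 above: reverse-and-add, then fold lanes 0 and 1.
    static float hsum_f32x4_model(const float v[4]) {
        float t0 = v[0] + v[3];   // lane 0 of v + reverse(v)
        float t1 = v[1] + v[2];   // lane 1 of v + reverse(v)
        return t0 + t1;           // == v[0] + v[1] + v[2] + v[3]
    }
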
package/cpp/ggml-cpu/ggml-cpu.c
CHANGED
@@ -373,6 +373,9 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
         .vec_dot_type = WSP_GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [WSP_GGML_TYPE_I32] = {
+        .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_i32,
+    },
 };

 const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_ggml_type type) {
@@ -470,10 +473,10 @@ struct wsp_ggml_threadpool {
 struct wsp_ggml_compute_state {
 #ifndef WSP_GGML_USE_OPENMP
     wsp_ggml_thread_t thrd;
-    bool cpumask[WSP_GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[WSP_GGML_MAX_N_THREADS];
     struct wsp_ggml_threadpool * threadpool;
     int ith;
 };
@@ -1876,10 +1879,18 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case WSP_GGML_OP_IM2COL_3D:
+            {
+                wsp_ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D:
             {
                 wsp_ggml_compute_forward_conv_2d(params, tensor);
             } break;
+        case WSP_GGML_OP_CONV_3D:
+            {
+                wsp_ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D_DW:
             {
                 wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2251,7 +2262,9 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         case WSP_GGML_OP_IM2COL:
         case WSP_GGML_OP_IM2COL_BACK:
+        case WSP_GGML_OP_IM2COL_3D:
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
         case WSP_GGML_OP_CONV_2D_DW:
        case WSP_GGML_OP_CONV_TRANSPOSE_1D:
         case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2686,7 +2699,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 if (wsp_ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
-                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
+                    (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == WSP_GGML_TYPE_F32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_I32) ||
+                    (node->src[0]->type == WSP_GGML_TYPE_I32 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F32)) {
                     cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
@@ -2773,6 +2789,7 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                 }
             } break;
         case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_3D:
             {
                 cur = WSP_GGML_IM2COL_WORK_SIZE;
             } break;
@@ -3064,7 +3081,14 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(

     threadpool->workers = workers;

-#ifndef WSP_GGML_USE_OPENMP
+#ifdef WSP_GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        wsp_ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // WSP_GGML_USE_OPENMP
     wsp_ggml_mutex_init(&threadpool->mutex);
     wsp_ggml_cond_init(&threadpool->cond);

@@ -3137,7 +3161,14 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }

-            wsp_ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            wsp_ggml_thread_apply_priority(threadpool->prio);
+            if (wsp_ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                wsp_ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            wsp_ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -3200,20 +3231,12 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
     }
 #endif
     for (; i < n; ++i) {
@@ -3241,21 +3264,6 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif

     for (; i < n; ++i) {
@@ -3270,6 +3278,13 @@ void wsp_ggml_cpu_fp32_to_bf16(const float * x, wsp_ggml_bf16_t * y, int64_t n)
     }
 }

+void wsp_ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void wsp_ggml_cpu_bf16_to_fp32(const wsp_ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
@@ -3459,14 +3474,6 @@ int wsp_ggml_cpu_has_vxe(void) {
 #endif
 }

-int wsp_ggml_cpu_has_nnpa(void) {
-#if defined(WSP_GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int wsp_ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
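
The new F32 -> I32 path relies on C's implicit float-to-integer conversion, which truncates toward zero. A small usage sketch (assumes the declaration of wsp_ggml_cpu_fp32_to_i32 is visible; the definition is in ggml-cpu.c above):

    #include <stdint.h>
    #include <stdio.h>

    void wsp_ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n);

    int main(void) {
        float   x[3] = { 1.9f, -1.9f, 3.0f };
        int32_t y[3];
        wsp_ggml_cpu_fp32_to_i32(x, y, 3);       // truncation toward zero
        printf("%d %d %d\n", y[0], y[1], y[2]);  // prints: 1 -1 3
        return 0;
    }
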
package/cpp/ggml-cpu/ggml-cpu.cpp
CHANGED
@@ -18,6 +18,10 @@
 # include "kleidiai/kleidiai.h"
 #endif

+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+# include "spacemit/ime.h"
+#endif
+
 #if defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buf
 }
 #endif

+#ifdef WSP_GGML_USE_CPU_RISCV64_SPACEMIT
+    if (wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+        bufts.push_back(wsp_ggml_backend_cpu_riscv64_spacemit_buffer_type());
+    }
+#endif
+
 #ifdef WSP_GGML_USE_CPU_KLEIDIAI
     if (wsp_ggml_backend_cpu_kleidiai_buffer_type()) {
         bufts.push_back(wsp_ggml_backend_cpu_kleidiai_buffer_type());
@@ -190,6 +200,7 @@ static const struct wsp_ggml_backend_i wsp_ggml_backend_cpu_i = {
     /* .graph_compute  = */ wsp_ggml_backend_cpu_graph_compute,
     /* .event_record   = */ NULL,
     /* .event_wait     = */ NULL,
+    /* .graph_optimize = */ NULL,
 };

 static wsp_ggml_guid_t wsp_ggml_backend_cpu_guid(void) {
@@ -348,8 +359,10 @@ static void wsp_ggml_backend_cpu_device_get_memory(wsp_ggml_backend_dev_t dev, s
     long pages = sysconf(_SC_PHYS_PAGES);
     long page_size = sysconf(_SC_PAGE_SIZE);
     *total = pages * page_size;
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
     *free = *total;
-#endif
+#endif // _WIN32

     WSP_GGML_UNUSED(dev);
 }
@@ -576,9 +589,6 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
     if (wsp_ggml_cpu_has_vxe()) {
         features.push_back({ "VXE", "1" });
     }
-    if (wsp_ggml_cpu_has_nnpa()) {
-        features.push_back({ "NNPA", "1" });
-    }
     if (wsp_ggml_cpu_has_wasm_simd()) {
         features.push_back({ "WASM_SIMD", "1" });
     }