whisper.rn 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +24 -18
- package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +1 -57
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/cpp/ggml-backend.cpp +36 -18
- package/cpp/ggml-backend.h +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/ggml-cpu/common.h +3 -2
- package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/ggml-cpu/ggml-cpu.c +95 -17
- package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/ggml-cpu/ops.cpp +775 -74
- package/cpp/ggml-cpu/ops.h +7 -0
- package/cpp/ggml-cpu/quants.c +25 -24
- package/cpp/ggml-cpu/repack.cpp +15 -14
- package/cpp/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/ggml-cpu/vec.cpp +26 -2
- package/cpp/ggml-cpu/vec.h +99 -45
- package/cpp/ggml-cpu.h +2 -0
- package/cpp/ggml-impl.h +125 -183
- package/cpp/ggml-metal-impl.h +27 -0
- package/cpp/ggml-metal.m +298 -41
- package/cpp/ggml-quants.c +6 -6
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +269 -40
- package/cpp/ggml.h +122 -2
- package/cpp/gguf.cpp +5 -1
- package/cpp/whisper.cpp +4 -0
- package/cpp/whisper.h +2 -0
- package/ios/RNWhisper.mm +35 -38
- package/ios/RNWhisperVadContext.h +1 -1
- package/ios/RNWhisperVadContext.mm +2 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/package.json +1 -1
package/cpp/ggml-cpu/arch/x86/repack.cpp CHANGED
```diff
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(wsp_ggml_fp16_t *x, wsp_ggml_fp16_t
     float tmp[16];

     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
     }

     for (int i = 0; i < 8; i++) {
-        tmp[i + 8] =
+        tmp[i + 8] = WSP_GGML_CPU_FP16_TO_FP32(y[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
     _mm_storeu_si128((__m128i*)tmphalf, x);

     for (int i = 0; i < 4; i++) {
-        tmp[i] =
-        tmp[i + 4] =
-        tmp[i + 8] =
-        tmp[i + 12] =
+        tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(wsp_ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(wsp_ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 4; i++) {
-        tmp[i] =
-        tmp[i + 4] =
+        tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
+        tmp[i + 4] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(wsp_ggml_fp16_t *x, __m128i ar

     _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
     for (int i = 0; i < 8; i++) {
-        tmp[i] =
+        tmp[i] = WSP_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x8(const float * WSP_GGML_RESTRICT x, void
         id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;

         // Store the scale for the individual block
-        y[i].d[row_iter] =
+        y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);

         // Store the values in blocks of eight values - Aim is to use these later for block interleaving
         srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x8(const float * WSP_GGML_RESTRICT x, void
             const float d = amax / ((1 << 7) - 1);
             id[row_iter] = d ? 1.0f / d : 0.0f;

-            y[i].d[row_iter] =
+            y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);
         }

         for (int j = 0; j < QK8_0 * 4; j++) {
```
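The two quantize_mat hunks above only change how the per-row scale is stored: the row's absolute maximum becomes a scale d = amax / 127, its inverse id is used to quantize, and d is written back through the CPU-local FP32→FP16 macro. A minimal scalar sketch of that Q8_0 scheme (illustrative only — the struct below is a simplified stand-in, not the package's interleaved 4-row block layout, and the scale is kept as a float instead of FP16):

```c
#include <math.h>
#include <stdint.h>

#define QK8_0 32

// Simplified stand-in for one Q8_0 block: 32 int8 values sharing one scale.
// The real kernels store the scale in half precision via
// WSP_GGML_CPU_FP32_TO_FP16(d); a plain float keeps this sketch self-contained.
typedef struct {
    float  d;            // block scale
    int8_t qs[QK8_0];    // quantized values
} block_q8_0_sketch;

static void quantize_block_q8_0_sketch(const float *x, block_q8_0_sketch *y) {
    float amax = 0.0f;                       // absolute maximum of the block
    for (int i = 0; i < QK8_0; i++) {
        const float v = fabsf(x[i]);
        if (v > amax) amax = v;
    }

    const float d  = amax / ((1 << 7) - 1);  // map the max to +/-127
    const float id = d ? 1.0f / d : 0.0f;    // inverse scale (0 for an all-zero block)

    y->d = d;
    for (int i = 0; i < QK8_0; i++) {
        y->qs[i] = (int8_t) roundf(x[i] * id);
    }
}
```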
```diff
@@ -647,7 +648,7 @@ void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
             const __m256 col_scale_f32 = WSP_GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);

             // Load and convert to FP32 scale from block_q8_0
-            const __m256 row_scale_f32 = _mm256_set1_ps(
+            const __m256 row_scale_f32 = _mm256_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(a_ptr[b].d));

             // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
             __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi *
+                sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -972,13 +973,13 @@ void wsp_ggml_gemv_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
                     sumi2 = sumi2 * scales_1[j];
                     sumi += sumi1 + sumi2;
                 }
-                sumf[j] += sumi *
+                sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
             }
         }
         for (int sb = 0; sb < 8; sb++) {
             uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
             for (int j = 0; j < ncols_interleaved; j++) {
-                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
+                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
             }
         }
     }
```
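In the GEMV kernels above, all arithmetic inside a block stays in integers; floats only appear when an accumulator is folded into the output. The dot-product term is scaled by the product of the two block scales, and for Q4_K a separate "mins" term (weight minima times activation bucket sums) is accumulated alongside it. A scalar sketch of how those two running sums fit together, with hypothetical names (not the package's code):

```c
// sumi   : integer dot product accumulated for one column/sub-block
// d_b    : weight block scale (stored as FP16 in the real code)
// d_a    : activation block scale
// min_j  : per-sub-block minimum of the Q4_K weights
// bsum   : sum of the int8 activations covered by that sub-block
// dmin_b : scale applied to the mins
static float q4k_block_result(int sumi, float d_b, float d_a,
                              int min_j, int bsum, float dmin_b) {
    const float dot  = (float) sumi * d_b * d_a;                    // the sumf[j] term
    const float mins = (float) min_j * (float) bsum * dmin_b * d_a; // the sum_minf[j] term
    return dot - mins;  // the kernels combine the two sums like this when writing the output
}
```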
```diff
@@ -1755,7 +1756,7 @@ void wsp_ggml_gemm_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
                         sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                     }
-                    sumf[m][j] += sumi *
+                    sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                 }
             }
         }
@@ -3259,7 +3260,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
                         sumi2 = sumi2 * scales_1[j];
                         sumi += sumi1 + sumi2;
                     }
-                    sumf[m][j] += sumi *
+                    sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                 }
             }
         }
@@ -3268,7 +3269,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs,
             for(int m = 0; m < 4; m++) {
                 const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                 for(int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
+                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                 }
             }
         }
```
package/cpp/ggml-cpu/common.h CHANGED
```diff
@@ -4,6 +4,7 @@
 #include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
+#include "simd-mappings.h"

 #ifdef __cplusplus

@@ -12,11 +13,11 @@
 // convenience functions/macros for use in template calls
 // note: these won't be required after the 'traits' lookup table is used.
 static inline wsp_ggml_fp16_t f32_to_f16(float x) {
-    return
+    return WSP_GGML_CPU_FP32_TO_FP16(x);
 }

 static inline float f16_to_f32(wsp_ggml_fp16_t x) {
-    return
+    return WSP_GGML_CPU_FP16_TO_FP32(x);
 }

 static inline wsp_ggml_bf16_t f32_to_bf16(float x) {
```
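These wrappers now route through the CPU backend's WSP_GGML_CPU_FP32_TO_FP16 / WSP_GGML_CPU_FP16_TO_FP32 macros from simd-mappings.h, which typically resolve to hardware conversion instructions or a table lookup depending on the platform. As a reference for what any such macro must compute, here is a portable scalar IEEE binary16 → binary32 decoder (an illustrative sketch, not the package's implementation):

```c
#include <math.h>
#include <stdint.h>

// Decode an IEEE 754 binary16 value stored as raw bits into a float.
static float fp16_bits_to_fp32(uint16_t h) {
    const int sign = (h >> 15) & 0x1;
    const int exp  = (h >> 10) & 0x1F;
    const int mant =  h        & 0x3FF;

    float v;
    if (exp == 0) {
        v = ldexpf((float) mant, -24);               // zero / subnormal: mant * 2^-24
    } else if (exp == 31) {
        v = mant ? NAN : INFINITY;                   // NaN / infinity
    } else {
        v = ldexpf((float)(mant + 1024), exp - 25);  // normal: (1 + mant/1024) * 2^(exp-15)
    }
    return sign ? -v : v;
}
```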
package/cpp/ggml-cpu/ggml-cpu-impl.h CHANGED
```diff
@@ -62,11 +62,17 @@ struct wsp_ggml_compute_params {
 #if defined(__s390x__) && defined(__VEC__)
 #ifndef __VXE__
 #define __VXE__
-#endif
+#endif // __VXE__
 #ifndef __VXE2__
 #define __VXE2__
-#endif
-#endif
+#endif // __VXE2__
+#endif // __s390x__ && __VEC__
+
+#if defined(__s390x__) && defined(WSP_GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && WSP_GGML_NNPA

 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
```
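The new block mirrors the existing VXE mapping: a build option (WSP_GGML_NNPA) is promoted to an architecture-style macro (__NNPA__) on s390x so later code can test a single symbol. A small sketch of how such a guard is consumed downstream (illustrative; the real consumers are the #elif defined(__NNPA__) branches in ggml-cpu.c further below):

```c
#include <stdbool.h>

// Returns whether this translation unit was built with the NNPA fast path.
// The runtime probe added in this release is wsp_ggml_cpu_has_nnpa().
static bool compiled_with_nnpa(void) {
#if defined(__NNPA__)
    return true;    // s390x build with WSP_GGML_NNPA enabled
#else
    return false;   // portable build
#endif
}
```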
package/cpp/ggml-cpu/ggml-cpu.c CHANGED
```diff
@@ -72,6 +72,9 @@
 #define UNUSED WSP_GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)

+// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
+float wsp_ggml_table_f32_f16[1 << 16];
+
 #if defined(__ARM_ARCH)
 struct wsp_ggml_arm_arch_features_type {
     int sve_cnt;
```
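wsp_ggml_table_f32_f16 has one float entry for every possible fp16 bit pattern (65,536 × 4 bytes = 256 KB). Once it is filled by wsp_ggml_cpu_init() (see the hunk near the end of this file), an fp16 → fp32 conversion becomes a single indexed load. A minimal sketch of the idea, with hypothetical helper names:

```c
#include <stdint.h>

// One float per possible 16-bit pattern.
static float table_f32_f16[1 << 16];

// Fill the table once at startup with any correct scalar converter.
static void init_f16_table(float (*fp16_bits_to_fp32)(uint16_t)) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_bits_to_fp32((uint16_t) i);
    }
}

// Afterwards, conversion needs no floating-point math at all.
static inline float f16_to_f32_lookup(uint16_t bits) {
    return table_f32_f16[bits];
}
```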
```diff
@@ -192,6 +195,7 @@ typedef pthread_t wsp_ggml_thread_t;

 static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT] = {
     [WSP_GGML_TYPE_F32] = {
+        .from_float = (wsp_ggml_from_float_t) wsp_ggml_cpu_fp32_to_fp32,
         .vec_dot = (wsp_ggml_vec_dot_t) wsp_ggml_vec_dot_f32,
         .vec_dot_type = WSP_GGML_TYPE_F32,
         .nrows = 1,
@@ -736,7 +740,7 @@ struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int3
             {
                 assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1),
+                    wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case WSP_GGML_TYPE_BF16:
@@ -795,7 +799,7 @@ struct wsp_ggml_tensor * wsp_ggml_set_f32(struct wsp_ggml_tensor * tensor, float
             {
                 assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1),
+                    wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), WSP_GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case WSP_GGML_TYPE_BF16:
@@ -846,7 +850,7 @@ int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i) {
         case WSP_GGML_TYPE_F16:
             {
                 WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
-                return
+                return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
             }
         case WSP_GGML_TYPE_BF16:
             {
@@ -891,7 +895,7 @@ void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t v
         case WSP_GGML_TYPE_F16:
             {
                 WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t));
-                ((wsp_ggml_fp16_t *)(tensor->data))[i] =
+                ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case WSP_GGML_TYPE_BF16:
             {
@@ -920,7 +924,7 @@ int32_t wsp_ggml_get_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i
         case WSP_GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case WSP_GGML_TYPE_F16:
-            return
+            return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
         case WSP_GGML_TYPE_BF16:
             return WSP_GGML_BF16_TO_FP32(((wsp_ggml_bf16_t *) data)[0]);
         case WSP_GGML_TYPE_F32:
@@ -947,7 +951,7 @@ void wsp_ggml_set_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
             } break;
         case WSP_GGML_TYPE_F16:
             {
-                ((wsp_ggml_fp16_t *)(data))[0] =
+                ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case WSP_GGML_TYPE_BF16:
             {
@@ -985,7 +989,7 @@ float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i) {
             }
         case WSP_GGML_TYPE_F16:
             {
-                return
+                return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]);
             }
         case WSP_GGML_TYPE_BF16:
             {
@@ -1024,7 +1028,7 @@ void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float val
             } break;
         case WSP_GGML_TYPE_F16:
             {
-                ((wsp_ggml_fp16_t *)(tensor->data))[i] =
+                ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case WSP_GGML_TYPE_BF16:
             {
@@ -1051,7 +1055,7 @@ float wsp_ggml_get_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
         case WSP_GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case WSP_GGML_TYPE_F16:
-            return
+            return WSP_GGML_CPU_FP16_TO_FP32(((wsp_ggml_fp16_t *) data)[0]);
         case WSP_GGML_TYPE_BF16:
             return WSP_GGML_BF16_TO_FP32(((wsp_ggml_bf16_t *) data)[0]);
         case WSP_GGML_TYPE_F32:
@@ -1078,7 +1082,7 @@ void wsp_ggml_set_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1,
             } break;
         case WSP_GGML_TYPE_F16:
             {
-                ((wsp_ggml_fp16_t *)(data))[0] =
+                ((wsp_ggml_fp16_t *)(data))[0] = WSP_GGML_CPU_FP32_TO_FP16(value);
             } break;
         case WSP_GGML_TYPE_BF16:
             {
@@ -1189,7 +1193,7 @@ static void wsp_ggml_compute_forward_mul_mat_one_chunk(
         }
     }

-
+void wsp_ggml_compute_forward_mul_mat(
         const struct wsp_ggml_compute_params * params,
         struct wsp_ggml_tensor * dst) {

@@ -1814,6 +1818,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_get_rows_back(params, tensor);
             } break;
+        case WSP_GGML_OP_SET_ROWS:
+            {
+                wsp_ggml_compute_forward_set_rows(params, tensor);
+            } break;
         case WSP_GGML_OP_DIAG:
             {
                 wsp_ggml_compute_forward_diag(params, tensor);
@@ -1858,6 +1866,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case WSP_GGML_OP_CONV_2D:
+            {
+                wsp_ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case WSP_GGML_OP_CONV_2D_DW:
             {
                 wsp_ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -1941,6 +1953,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_unary(params, tensor);
             } break;
+        case WSP_GGML_OP_GLU:
+            {
+                wsp_ggml_compute_forward_glu(params, tensor);
+            } break;
         case WSP_GGML_OP_GET_REL_POS:
             {
                 wsp_ggml_compute_forward_get_rel_pos(params, tensor);
@@ -2151,6 +2167,18 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                         WSP_GGML_ABORT("fatal error");
                 }
                 break;
+        case WSP_GGML_OP_GLU:
+            switch (wsp_ggml_get_glu_op(node)) {
+                case WSP_GGML_GLU_OP_REGLU:
+                case WSP_GGML_GLU_OP_GEGLU:
+                case WSP_GGML_GLU_OP_SWIGLU:
+                    {
+                        n_tasks = n_threads;
+                    } break;
+                default:
+                    WSP_GGML_ABORT("fatal error");
+            }
+            break;
         case WSP_GGML_OP_SILU_BACK:
         case WSP_GGML_OP_MUL:
         case WSP_GGML_OP_DIV:
```
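WSP_GGML_OP_GLU covers the gated-linear-unit family; the three variants threaded through the scheduler here differ only in the activation applied to one half of the input before it multiplies the other half. Scalar reference formulas (an illustrative sketch, not the package's kernels; which half plays the gate is the caller's convention):

```c
#include <math.h>

static float silu_ref(float x) { return x / (1.0f + expf(-x)); }        // x * sigmoid(x)

static float gelu_ref(float x) {                                         // tanh approximation
    return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
}

static float reglu (float a, float b) { return (a > 0.0f ? a : 0.0f) * b; } // ReLU-gated
static float geglu (float a, float b) { return gelu_ref(a) * b; }           // GELU-gated
static float swiglu(float a, float b) { return silu_ref(a) * b; }           // SiLU-gated
```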
```diff
@@ -2167,6 +2195,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case WSP_GGML_OP_GET_ROWS:
+        case WSP_GGML_OP_SET_ROWS:
             {
                 // FIXME: get_rows can use additional threads, but the cost of launching additional threads
                 // decreases performance with GPU offloading
@@ -2203,6 +2232,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         case WSP_GGML_OP_IM2COL:
         case WSP_GGML_OP_IM2COL_BACK:
+        case WSP_GGML_OP_CONV_2D:
         case WSP_GGML_OP_CONV_2D_DW:
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
         case WSP_GGML_OP_CONV_TRANSPOSE_2D:
@@ -2721,6 +2751,10 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
                             WSP_GGML_ABORT("fatal error");
                     }
                 } break;
+            case WSP_GGML_OP_CONV_2D:
+                {
+                    cur = WSP_GGML_IM2COL_WORK_SIZE;
+                } break;
             case WSP_GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -3121,6 +3155,10 @@ enum wsp_ggml_status wsp_ggml_graph_compute_with_ctx(struct wsp_ggml_context * c
     return wsp_ggml_graph_compute(cgraph, &cplan);
 }

+void wsp_ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
+    memcpy(y, x, n * sizeof(float));
+}
+
 void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
 #if defined(__F16C__)
@@ -3141,9 +3179,24 @@ void wsp_ggml_cpu_fp32_to_fp16(const float * x, wsp_ggml_fp16_t * y, int64_t n)
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
+    }
+    for (; i + 3 < n; i += 4) {
+        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (wsp_ggml_fp16_t *)(y + i));
+    }
 #endif
     for (; i < n; ++i) {
-        y[i] =
+        y[i] = WSP_GGML_CPU_FP32_TO_FP16(x[i]);
     }
 }

@@ -3167,9 +3220,25 @@ void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t * x, float * y, int64_t n)
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i + 0));
+        vec_xst(v_yl, 0, (float *)(y + i + 4));
+    }
+    for (; i + 3 < n; i += 4) {
+        uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i));
+    }
 #endif
+
     for (; i < n; ++i) {
-        y[i] =
+        y[i] = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
     }
 }

```
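Both conversion routines share the same control flow on every SIMD path, including the new NNPA branches: a wide vector body, a narrower body for the next few elements, and a scalar tail that falls back to the per-element macro. The skeleton, stripped of intrinsics (illustrative only; the per-element callback stands in for the real conversion):

```c
#include <stdint.h>

static void convert_tiered(const float *x, float *y, int64_t n,
                           float (*op)(float)) {
    int64_t i = 0;
    for (; i + 7 < n; i += 8) {                       // 8-wide body
        for (int k = 0; k < 8; ++k) y[i + k] = op(x[i + k]);
    }
    for (; i + 3 < n; i += 4) {                       // 4-wide body
        for (int k = 0; k < 4; ++k) y[i + k] = op(x[i + k]);
    }
    for (; i < n; ++i) {                              // scalar tail
        y[i] = op(x[i]);
    }
}
```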
```diff
@@ -3369,6 +3438,14 @@ int wsp_ggml_cpu_has_vxe(void) {
 #endif
 }

+int wsp_ggml_cpu_has_nnpa(void) {
+#if defined(WSP_GGML_NNPA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int wsp_ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
```
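wsp_ggml_cpu_has_nnpa() follows the same shape as the other capability probes and is also surfaced as an "NNPA" entry in the CPU backend's feature list (see the ggml-cpu.cpp hunk at the end of this diff). A hypothetical caller-side check, assuming the probe is declared in ggml-cpu.h as the +2 lines for that header suggest:

```c
#include <stdio.h>
#include "ggml-cpu.h"   // assumed to declare the wsp_ggml_cpu_has_* probes

int main(void) {
    printf("NEON: %d\n", wsp_ggml_cpu_has_neon());
    printf("VXE:  %d\n", wsp_ggml_cpu_has_vxe());
    printf("NNPA: %d\n", wsp_ggml_cpu_has_nnpa());
    return 0;
}
```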
```diff
@@ -3418,7 +3495,7 @@ int wsp_ggml_cpu_has_sme(void) {
 }

 void wsp_ggml_cpu_init(void) {
-    // needed to initialize
+    // needed to initialize wsp_ggml_time
     {
         struct wsp_ggml_init_params params = { 0, NULL, false };
         struct wsp_ggml_context * ctx = wsp_ggml_init(params);
@@ -3439,9 +3516,10 @@ void wsp_ggml_cpu_init(void) {
                 uint16_t u16;
                 wsp_ggml_fp16_t fp16;
             } u = {i};
-            float f =
-
-
+            float f = WSP_GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+            wsp_ggml_table_f32_f16[i] = f;
+            wsp_ggml_table_gelu_f16[i] = WSP_GGML_CPU_FP32_TO_FP16(wsp_ggml_gelu_f32(f));
+            wsp_ggml_table_gelu_quick_f16[i] = WSP_GGML_CPU_FP32_TO_FP16(wsp_ggml_gelu_quick_f32(f));
         }

         const uint64_t t_end = wsp_ggml_time_us(); UNUSED(t_end);
```
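wsp_ggml_cpu_init() walks all 65,536 fp16 bit patterns once and fills three tables: the f32 conversion table declared earlier in this file, plus GELU and quick-GELU tables holding the activation of each representable fp16 value. After that, an fp16 activation can be evaluated as a lookup keyed by the raw bits. A sketch of that use (the lookup helper and the typedef are assumptions for illustration; the table symbol is the one filled above):

```c
#include <stdint.h>
#include <string.h>

typedef uint16_t wsp_ggml_fp16_t;   // assumption: fp16 values are carried as raw bits

extern wsp_ggml_fp16_t wsp_ggml_table_gelu_f16[1 << 16];   // filled by wsp_ggml_cpu_init()

static inline wsp_ggml_fp16_t gelu_f16_lookup(wsp_ggml_fp16_t x) {
    uint16_t bits;
    memcpy(&bits, &x, sizeof(bits));         // reinterpret the fp16 bit pattern
    return wsp_ggml_table_gelu_f16[bits];    // one table read, no exp/tanh at run time
}
```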
package/cpp/ggml-cpu/ggml-cpu.cpp CHANGED
```diff
@@ -416,6 +416,7 @@ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev,

     switch (op->op) {
         case WSP_GGML_OP_CPY:
+        case WSP_GGML_OP_SET_ROWS:
             return
                 op->type != WSP_GGML_TYPE_IQ3_XXS &&
                 op->type != WSP_GGML_TYPE_IQ3_S &&
@@ -578,6 +579,9 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
         if (wsp_ggml_cpu_has_vxe()) {
             features.push_back({ "VXE", "1" });
         }
+        if (wsp_ggml_cpu_has_nnpa()) {
+            features.push_back({ "NNPA", "1" });
+        }
         if (wsp_ggml_cpu_has_wasm_simd()) {
             features.push_back({ "WASM_SIMD", "1" });
         }
```