whisper.rn 0.5.0-rc.9 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +265 -141
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +30 -13
- package/cpp/ggml-backend.cpp +221 -38
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +32 -2
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +70 -42
- package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
- package/cpp/ggml-cpu/ops.cpp +1587 -1177
- package/cpp/ggml-cpu/ops.h +5 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +89 -60
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +170 -26
- package/cpp/ggml-cpu/vec.h +506 -63
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +119 -9
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +90 -51
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +111 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml.c +486 -98
- package/cpp/ggml.h +221 -16
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +25 -6
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +100 -76
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +16 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
- package/src/realtime-transcription/types.ts +6 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6284
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-cpu/repack.h
CHANGED

@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
 };
 
 static_assert(sizeof(block_q4_Kx8) == sizeof(wsp_ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+struct block_q2_Kx8 {
+    wsp_ggml_half d[8];      // super-block scale for quantized scales
+    wsp_ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[128];     // scales and mins, quantized with 4 bits
+    uint8_t qs[512];         // 2-bit quants
+};
 
+static_assert(sizeof(block_q2_Kx8) == sizeof(wsp_ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
 struct block_q8_Kx4 {
     float d[4];              // delta
     int8_t qs[QK_K * 4];     // quants
@@ -60,6 +67,13 @@ struct block_iq4_nlx4 {
 
 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(wsp_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 
+struct block_iq4_nlx8 {
+    wsp_ggml_half d[8];       // deltas for 8 iq4_nl blocks
+    uint8_t qs[QK4_NL * 4];   // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(wsp_ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -71,12 +85,16 @@ void wsp_ggml_gemv_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
 void wsp_ggml_gemv_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemv_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemv_iq4_nl_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemm_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_iq4_nl_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemm_iq4_nl_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
 void wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT vy, int64_t k);
@@ -86,12 +104,16 @@ void wsp_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 void wsp_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 void wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
+void wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
 } // extern "C"
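Note: a quick way to sanity-check the new block_q2_Kx8 layout is to reproduce its static_assert in a standalone translation unit. The sketch below is an illustration, not package code; it assumes ggml's usual constants (QK_K == 256, a 2-byte wsp_ggml_half), under which eight 84-byte q2_K super-blocks repack into one 672-byte interleaved tile.

#include <stdint.h>

#define QK_K 256
typedef uint16_t wsp_ggml_half; // assumption: IEEE fp16 stored in 16 bits

// Eight q2_K super-blocks interleaved: 16 half scales (32 B) +
// 128 B of 4-bit scales/mins + 512 B of 2-bit quants.
struct block_q2_Kx8 {
    wsp_ggml_half d[8];
    wsp_ggml_half dmin[8];
    uint8_t scales[128]; // QK_K/2
    uint8_t qs[512];     // QK_K*2
};

_Static_assert(sizeof(struct block_q2_Kx8) ==
               sizeof(wsp_ggml_half) * 16 + QK_K / 2 + QK_K * 2, // 32 + 128 + 512 = 672
               "wrong q2_K block size/padding");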
package/cpp/ggml-cpu/simd-mappings.h
CHANGED

@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif
 
+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
     }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
-        float f;
-        __asm__(
-            "fmv.h.x %[f], %[h]\n\t"
-            "fcvt.s.h %[f], %[f]"
-            : [f] "=&f" (f)
-            : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(wsp_ggml_fp16_t));
+        return hf;
     }
 
     static inline wsp_ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         wsp_ggml_fp16_t res;
-        __asm__(
-            "fcvt.h.s %[f], %[f]\n\t"
-            "fmv.x.h %[h], %[f]"
-            : [h] "=&r" (res)
-            : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(wsp_ggml_fp16_t));
         return res;
     }
 
@@ -119,26 +114,6 @@ extern "C" {
 #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
 #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
 #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define WSP_GGML_CPU_FP16_TO_FP32(x) WSP_GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define WSP_GGML_CPU_FP32_TO_FP16(x) WSP_GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(wsp_ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline wsp_ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 
 // precomputed f32 table for f16 (256 KB)
@@ -189,7 +164,7 @@ inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) {
 #define WSP_GGML_F32xt_LOAD(...) WSP_GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define WSP_GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
 #define WSP_GGML_F32xt_STORE(...) WSP_GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
-#define WSP_GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define WSP_GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
 #define WSP_GGML_F32xt_FMA(...) WSP_GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define WSP_GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
 #define WSP_GGML_F32xt_ADD(...) WSP_GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
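Note: the one-line WSP_GGML_F32xt_FMA_IMPL change above is an argument-order change, not a numerical one. Per the ACLE definition, svmad_f32_m(pg, op1, op2, op3) computes op1*op2 + op3; the call sites in vec.cpp (further down in this diff) now pass the accumulator first. A sketch with hypothetical macro names, not the package's headers:

// Both variants evaluate acc + ax*ay; only the macro parameter order moved
// to "accumulator first", matching the new FMA(sum, ax, ay) call sites.
#define FMA_OLD(pg, a, b, c) svmad_f32_m(pg, a, b, c) // FMA_OLD(ax, ay, sum) = ax*ay + sum
#define FMA_NEW(pg, a, b, c) svmad_f32_m(pg, b, c, a) // FMA_NEW(sum, ax, ay) = ax*ay + sum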
@@ -220,6 +195,47 @@ inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) {
 #define WSP_GGML_F32_VEC_MUL WSP_GGML_F32xt_MUL
 #define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32xt_REDUCE
 
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define WSP_GGML_F32Cxt svfloat16_t
+#define WSP_GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
+#define WSP_GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+#define WSP_GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define WSP_GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define WSP_GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define WSP_GGML_F32Cxt_FMA(...) WSP_GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define WSP_GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define WSP_GGML_F32Cxt_ADD(...) WSP_GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define WSP_GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define WSP_GGML_F32Cxt_MUL(...) WSP_GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define WSP_GGML_F32Cxt_REDUCE WSP_GGML_F16xt_REDUCE_MIXED
+
+#define WSP_GGML_F16x_VEC WSP_GGML_F32Cxt
+#define WSP_GGML_F16x_VEC_ZERO WSP_GGML_F32Cxt_ZERO
+#define WSP_GGML_F16x_VEC_SET1 WSP_GGML_F32Cxt_SET1
+#define WSP_GGML_F16x_VEC_LOAD(p, i) WSP_GGML_F32Cxt_LOAD(p)
+#define WSP_GGML_F16x_VEC_STORE(p, r, i) WSP_GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define WSP_GGML_F16x_VEC_FMA WSP_GGML_F32Cxt_FMA
+#define WSP_GGML_F16x_VEC_ADD WSP_GGML_F32Cxt_ADD
+#define WSP_GGML_F16x_VEC_MUL WSP_GGML_F32Cxt_MUL
+#define WSP_GGML_F16x_VEC_REDUCE WSP_GGML_F32Cxt_REDUCE
+
+#define WSP_GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define WSP_GGML_F16xt_REDUCE_ONE(...) WSP_GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define WSP_GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{ \
+    sum1 = svadd_f16_x(pg16, sum1, sum2); \
+    sum3 = svadd_f16_x(pg16, sum3, sum4); \
+    sum1 = svadd_f16_x(pg16, sum1, sum3); \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
+    (res) = (wsp_ggml_float) sum_f16; \
+}
+#define WSP_GGML_F16xt_REDUCE_MIXED(...) WSP_GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
 // F16 NEON
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
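Note: in the new f16 SVE path the four svfloat16_t accumulators are summed in half precision and widened only once, in WSP_GGML_F16xt_REDUCE_MIXED. IEEE binary16 overflows at 65504, so large dot products can saturate before the final widening; this is the situation the assert(!isnan(sumf) && !isinf(sumf)) added in vec.cpp (further down) is meant to catch. A minimal standalone demonstration, assuming a compiler and target with _Float16 support:

#include <stdio.h>

int main(void) {
    _Float16 acc = 0;
    for (int i = 0; i < 4; ++i) {
        acc += (_Float16)30000.0f; // 30000, 60000, then > 65504 -> +inf
    }
    printf("%f\n", (double)acc);   // prints inf: the f16 accumulator overflowed
    return 0;
}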
@@ -982,9 +998,9 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
 #define WSP_GGML_F32_EPR 4
 
 #define WSP_GGML_F32x4 __m128
-#define WSP_GGML_F32x4_ZERO __lsx_vldi(0)
-#define WSP_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define WSP_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define WSP_GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
+#define WSP_GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define WSP_GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
 #define WSP_GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
 #define WSP_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
 #define WSP_GGML_F32x4_ADD __lsx_vfadd_s
@@ -1006,7 +1022,7 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
     __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
     tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
     tmp = __lsx_vsrli_d((__m128i) t0, 32); \
     tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
     tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@@ -1036,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const wsp_ggml_fp16_t * x) {
     tmp[2] = WSP_GGML_CPU_FP16_TO_FP32(x[2]);
     tmp[3] = WSP_GGML_CPU_FP16_TO_FP32(x[3]);
 
-    return __lsx_vld(tmp, 0);
+    return (__m128)__lsx_vld(tmp, 0);
 }
 
 static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
@@ -1051,9 +1067,9 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
 }
 
 #define WSP_GGML_F32Cx4 __m128
-#define WSP_GGML_F32Cx4_ZERO __lsx_vldi(0)
-#define WSP_GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
-#define WSP_GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
+#define WSP_GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
+#define WSP_GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define WSP_GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
 #define WSP_GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
 #define WSP_GGML_F32Cx4_FMA WSP_GGML_F32x4_FMA
 #define WSP_GGML_F32Cx4_ADD __lsx_vfadd_s
@@ -1120,11 +1136,6 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
 #define WSP_GGML_F16_EPR WSP_GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const wsp_ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -1134,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -1157,7 +1157,6 @@ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = WSP_GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 
 #define WSP_GGML_F16_VEC WSP_GGML_F32x4
@@ -1170,6 +1169,36 @@ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, float32x4_t v_y) {
 #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32x4_MUL
 #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32x4_REDUCE
 
+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define WSP_GGML_SIMD
+
+// F32
+
+#define WSP_GGML_F32_STEP 16
+#define WSP_GGML_F32_EPR 4
+
+#define WSP_GGML_F32x4 vfloat32m1_t
+#define WSP_GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, WSP_GGML_F32_EPR)
+#define WSP_GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, WSP_GGML_F32_EPR)
+
+#define WSP_GGML_F32_VEC WSP_GGML_F32x4
+#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO
+#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1
+#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD
+#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE
+#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA
+#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD
+#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL
+#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE
+
 #endif
 
 // WSP_GGML_F32_ARR / WSP_GGML_F16_ARR
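Note: the RISC-V block above deliberately mirrors the existing per-arch mappings, so kernels written against WSP_GGML_F32_VEC_* compile unchanged. A hypothetical axpy kernel showing the convention under the fixed-width mappings (NEON, SSE, LSX, RVV), where WSP_GGML_F32_EPR is a compile-time lane count; this is an illustration, not code from the package:

// y[i] += a * x[i], written only in terms of the portable macro layer.
static void axpy_f32(int n, float a, const float * x, float * y) {
    const int np = n & ~(WSP_GGML_F32_EPR - 1);
    WSP_GGML_F32_VEC va = WSP_GGML_F32_VEC_SET1(a);
    for (int i = 0; i < np; i += WSP_GGML_F32_EPR) {
        WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_LOAD(x + i);
        WSP_GGML_F32_VEC vy = WSP_GGML_F32_VEC_LOAD(y + i);
        // FMA(acc, a, b) == acc + a*b under the new argument order
        WSP_GGML_F32_VEC_STORE(y + i, WSP_GGML_F32_VEC_FMA(vy, va, vx));
    }
    for (int i = np; i < n; ++i) {
        y[i] += a * x[i]; // scalar leftovers
    }
}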
package/cpp/ggml-cpu/traits.cpp
CHANGED

@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool wsp_ggml_cpu_extra_compute_forward(struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * op) {
-    for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : wsp_ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool wsp_ggml_cpu_extra_compute_forward(struct wsp_ggml_compute_params * params,
 }
 
 bool wsp_ggml_cpu_extra_work_size(int n_threads, const struct wsp_ggml_tensor * op, size_t * size) {
-    for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : wsp_ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
package/cpp/ggml-cpu/traits.h
CHANGED

@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buffers_type();
+std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buffer_types();
 
 #endif
package/cpp/ggml-cpu/vec.cpp
CHANGED

@@ -37,35 +37,35 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
     for (int i = 0; i < np; i += wsp_ggml_f32_step) {
         ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
         ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
-        sum1 = WSP_GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = WSP_GGML_F32_VEC_FMA(sum1, ax1, ay1);
 
         ax2 = WSP_GGML_F32_VEC_LOAD(x + i + 1*wsp_ggml_f32_epr);
         ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
-        sum2 = WSP_GGML_F32_VEC_FMA(ax2, ay2, sum2);
+        sum2 = WSP_GGML_F32_VEC_FMA(sum2, ax2, ay2);
 
         ax3 = WSP_GGML_F32_VEC_LOAD(x + i + 2*wsp_ggml_f32_epr);
         ay3 = WSP_GGML_F32_VEC_LOAD(y + i + 2*wsp_ggml_f32_epr);
-        sum3 = WSP_GGML_F32_VEC_FMA(ax3, ay3, sum3);
+        sum3 = WSP_GGML_F32_VEC_FMA(sum3, ax3, ay3);
 
         ax4 = WSP_GGML_F32_VEC_LOAD(x + i + 3*wsp_ggml_f32_epr);
         ay4 = WSP_GGML_F32_VEC_LOAD(y + i + 3*wsp_ggml_f32_epr);
-        sum4 = WSP_GGML_F32_VEC_FMA(ax4, ay4, sum4);
+        sum4 = WSP_GGML_F32_VEC_FMA(sum4, ax4, ay4);
 
         ax5 = WSP_GGML_F32_VEC_LOAD(x + i + 4*wsp_ggml_f32_epr);
         ay5 = WSP_GGML_F32_VEC_LOAD(y + i + 4*wsp_ggml_f32_epr);
-        sum5 = WSP_GGML_F32_VEC_FMA(ax5, ay5, sum5);
+        sum5 = WSP_GGML_F32_VEC_FMA(sum5, ax5, ay5);
 
         ax6 = WSP_GGML_F32_VEC_LOAD(x + i + 5*wsp_ggml_f32_epr);
         ay6 = WSP_GGML_F32_VEC_LOAD(y + i + 5*wsp_ggml_f32_epr);
-        sum6 = WSP_GGML_F32_VEC_FMA(ax6, ay6, sum6);
+        sum6 = WSP_GGML_F32_VEC_FMA(sum6, ax6, ay6);
 
         ax7 = WSP_GGML_F32_VEC_LOAD(x + i + 6*wsp_ggml_f32_epr);
         ay7 = WSP_GGML_F32_VEC_LOAD(y + i + 6*wsp_ggml_f32_epr);
-        sum7 = WSP_GGML_F32_VEC_FMA(ax7, ay7, sum7);
+        sum7 = WSP_GGML_F32_VEC_FMA(sum7, ax7, ay7);
 
         ax8 = WSP_GGML_F32_VEC_LOAD(x + i + 7*wsp_ggml_f32_epr);
         ay8 = WSP_GGML_F32_VEC_LOAD(y + i + 7*wsp_ggml_f32_epr);
-        sum8 = WSP_GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        sum8 = WSP_GGML_F32_VEC_FMA(sum8, ax8, ay8);
     }
     // leftovers
     // Since 8 unrolls are done in above loop, leftovers lie in range [0, wsp_ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
     for (int i = np; i < np2; i += wsp_ggml_f32_epr) {
         ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
         ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
-        sum1 = WSP_GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = WSP_GGML_F32_VEC_FMA(sum1, ax1, ay1);
     }
     // maximum number of leftover elements will be less than wsp_ggml_f32_epr. Apply predicated svmad on available elements only
     if (np2 < n) {
@@ -84,6 +84,22 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
     }
     // reduce sum1,sum2 to sum1
    WSP_GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+#elif defined(__riscv_v_intrinsic)
+    int vl = __riscv_vsetvlmax_e32m8();
+    vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    vfloat32m8_t vsum;
+    vfloat32m8_t ax;
+    vfloat32m8_t ay;
+    vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+    for (int i = 0; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m8(n - i);
+        ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+        ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+        vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
+    }
+    vl = __riscv_vsetvlmax_e32m8();
+    vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
 #else
     const int np = (n & ~(WSP_GGML_F32_STEP - 1));
 
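Note on the tail-undisturbed (_tu) intrinsics in the RVV branch above: each round processes vl = min(vlmax, n - i) elements, and the _tu forms leave lanes >= vl untouched, so a short final round cannot clobber the partial sums accumulated by earlier full rounds. A scalar model of what the strip-mined loop plus vfredusum computes (a sketch, not package code; VLMAX stands in for the hardware's __riscv_vsetvlmax_e32m8()):

#define VLMAX 64 // placeholder for the runtime maximum vector length

static float dot_f32_model(int n, const float * x, const float * y) {
    float acc[VLMAX] = {0}; // vsum: one partial sum per lane
    for (int i = 0; i < n; ) {
        int vl = (n - i < VLMAX) ? n - i : VLMAX;  // vsetvl
        for (int l = 0; l < vl; ++l) {
            acc[l] += x[i + l] * y[i + l];         // vfmacc_..._tu
        }
        i += vl;
    }
    float sumf = 0.0f;
    for (int l = 0; l < VLMAX; ++l) {
        sumf += acc[l];                            // vfredusum
    }
    return sumf;
}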
@@ -197,35 +213,125 @@ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggm
 
     wsp_ggml_float sumf = 0.0;
 
+
 #if defined(WSP_GGML_SIMD)
-    const int np = (n & ~(WSP_GGML_F16_STEP - 1));
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8; // get vector length
+        const int wsp_ggml_f16_epr = sve_register_length / 16; // running when 16
+        const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
+
+        const int np = (n & ~(wsp_ggml_f16_step - 1));
+        svfloat16_t sum1 = svdup_n_f16(0.0f);
+        svfloat16_t sum2 = svdup_n_f16(0.0f);
+        svfloat16_t sum3 = svdup_n_f16(0.0f);
+        svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += wsp_ggml_f16_step) {
+            ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
+            ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
+            sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
+            ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
+            sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
+            ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
+            sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
+            ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
+            sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
+            ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
+            sum1 = WSP_GGML_F16x_VEC_FMA(sum1, ax5, ay5);
+
+            ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
+            ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
+            sum2 = WSP_GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+            ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
+            ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
+            sum3 = WSP_GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+            ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
+            ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
+            sum4 = WSP_GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+        }
+
+        const int np2 = (n & ~(wsp_ggml_f16_epr - 1)); // round down to multiple of 8
+        for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
+            svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
+            sum1 = WSP_GGML_F16x_VEC_FMA(sum1, rx, ry);
+        }
+
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
 
-    WSP_GGML_F16_VEC sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO };
+            sum1 = svmad_f16_x(pg, hx, hy, sum1);
+        }
+        WSP_GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+    #elif defined(__riscv_v_intrinsic)
+        #if defined(__riscv_zvfh)
+            int vl = __riscv_vsetvlmax_e32m2();
+            vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+            vfloat32m2_t vsum;
+            vfloat16m1_t ax;
+            vfloat16m1_t ay;
+            vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+            for (int i = 0; i < n; i += vl) {
+                vl = __riscv_vsetvl_e16m1(n - i);
+                ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
+            }
+            vl = __riscv_vsetvlmax_e32m1();
+            vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+            vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+            sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+        #else
+            for (int i = 0; i < n; ++i) {
+                sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        #endif // __riscv_zvfh
+    #else
+        const int np = (n & ~(WSP_GGML_F16_STEP - 1));
+
+        WSP_GGML_F16_VEC sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO };
 
-    WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
-    WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
+        WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
+        WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
 
-    for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
-        for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
-            ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
-            ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
+        for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
+            for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
+                ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
+                ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
 
-            sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+                sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
         }
-    }
 
-    // reduce sum0..sum3 to sum0
-    WSP_GGML_F16_VEC_REDUCE(sumf, sum);
+        // reduce sum0..sum3 to sum0
+        WSP_GGML_F16_VEC_REDUCE(sumf, sum);
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
-    }
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+        // if you hit this, you are likely running outside the FP range
+        assert(!isnan(sumf) && !isinf(sumf));
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
     }
-#endif
+#endif // WSP_GGML_SIMD
 
     *s = sumf;
 }
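Note on the predicated tails used throughout the new SVE code: svwhilelt_b16(np2, n) (and its b32 counterpart in the silu/softmax loops below) builds a predicate whose first n - np2 lanes are active, so a single predicated load plus svmad_f16_x replaces a scalar cleanup loop. In scalar terms the guarded step above is just (a sketch; fp16 values shown as float for clarity):

// Scalar model of the svwhilelt-guarded tail in wsp_ggml_vec_dot_f16.
static float tail_model(int np2, int n, const float * x, const float * y, float sum) {
    for (int k = np2; k < n; ++k) { // the lanes svwhilelt_b16(np2, n) activates
        sum += x[k] * y[k];         // svmad_f16_x(pg, hx, hy, sum1)
    }
    return sum;                     // lanes >= n are never read or written
}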
|
@@ -244,6 +350,12 @@ void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
|
|
244
350
|
for (; i + 3 < n; i += 4) {
|
|
245
351
|
_mm_storeu_ps(y + i, wsp_ggml_v_silu(_mm_loadu_ps(x + i)));
|
|
246
352
|
}
|
|
353
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
354
|
+
const int vlen = svcntw();
|
|
355
|
+
for (; i < n; i += vlen) {
|
|
356
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
357
|
+
svst1_f32(pg, y + i, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)));
|
|
358
|
+
}
|
|
247
359
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
248
360
|
for (; i + 3 < n; i += 4) {
|
|
249
361
|
vst1q_f32(y + i, wsp_ggml_v_silu(vld1q_f32(x + i)));
|
|
@@ -268,10 +380,24 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
|
|
|
268
380
|
for (; i + 3 < n; i += 4) {
|
|
269
381
|
_mm_storeu_ps(y + i, _mm_mul_ps(wsp_ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
|
|
270
382
|
}
|
|
383
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
384
|
+
const int vlen = svcntw();
|
|
385
|
+
for (; i < n; i += vlen) {
|
|
386
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
387
|
+
svst1_f32(pg, y + i, svmul_f32_x(pg, wsp_ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
|
|
388
|
+
}
|
|
271
389
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
272
390
|
for (; i + 3 < n; i += 4) {
|
|
273
391
|
vst1q_f32(y + i, vmulq_f32(wsp_ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
|
|
274
392
|
}
|
|
393
|
+
#elif defined(__riscv_v_intrinsic)
|
|
394
|
+
for (int vl; i < n; i += vl) {
|
|
395
|
+
vl = __riscv_vsetvl_e32m2(n - i);
|
|
396
|
+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
|
|
397
|
+
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
|
|
398
|
+
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(wsp_ggml_v_silu_m2(vx, vl), vg, vl);
|
|
399
|
+
__riscv_vse32_v_f32m2(&y[i], vy, vl);
|
|
400
|
+
}
|
|
275
401
|
#endif
|
|
276
402
|
for (; i < n; ++i) {
|
|
277
403
|
y[i] = wsp_ggml_silu_f32(x[i]) * g[i];
|
|
@@ -315,6 +441,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
|
|
|
315
441
|
#endif
|
|
316
442
|
sum += (wsp_ggml_float)_mm_cvtss_f32(val);
|
|
317
443
|
}
|
|
444
|
+
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
445
|
+
const int vlen = svcntw();
|
|
446
|
+
for (; i < n; i += vlen) {
|
|
447
|
+
const svbool_t pg = svwhilelt_b32_s32(i, n);
|
|
448
|
+
svfloat32_t val = wsp_ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
|
|
449
|
+
svdup_n_f32_x(pg, max)));
|
|
450
|
+
svst1_f32(pg, y + i, val);
|
|
451
|
+
sum += (wsp_ggml_float)svaddv_f32(pg, val);
|
|
452
|
+
}
|
|
318
453
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
319
454
|
for (; i + 3 < n; i += 4) {
|
|
320
455
|
float32x4_t val = wsp_ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
|
@@ -322,6 +457,15 @@ wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x
|
|
|
322
457
|
vst1q_f32(y + i, val);
|
|
323
458
|
sum += (wsp_ggml_float)vaddvq_f32(val);
|
|
324
459
|
}
|
|
460
|
+
#elif defined(__riscv_v_intrinsic)
|
|
461
|
+
vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
|
|
462
|
+
for (int avl; i < n; i += avl) {
|
|
463
|
+
avl = __riscv_vsetvl_e32m2(n - i);
|
|
464
|
+
vfloat32m2_t val = wsp_ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
|
|
465
|
+
__riscv_vse32_v_f32m2(&y[i], val, avl);
|
|
466
|
+
vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
|
|
467
|
+
}
|
|
468
|
+
return (wsp_ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
|
|
325
469
|
#endif
|
|
326
470
|
for (; i < n; ++i) {
|
|
327
471
|
float val = expf(x[i] - max);
|