whisper.rn 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +264 -126
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +13 -5
- package/cpp/ggml-backend.cpp +207 -17
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/cpp/ggml-cpu/arch-fallback.h +0 -4
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +48 -41
- package/cpp/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/ggml-cpu/ops.cpp +518 -767
- package/cpp/ggml-cpu/ops.h +2 -0
- package/cpp/ggml-cpu/simd-mappings.h +88 -59
- package/cpp/ggml-cpu/vec.cpp +161 -20
- package/cpp/ggml-cpu/vec.h +400 -51
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +43 -10
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +40 -40
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-quants.c +1 -0
- package/cpp/ggml.c +175 -13
- package/cpp/ggml.h +84 -5
- package/cpp/jsi/RNWhisperJSI.cpp +2 -0
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +85 -70
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisperVadContext.mm +14 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +43 -10
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +40 -40
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +84 -5
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6779
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml-cpu/vec.h
CHANGED
|
@@ -119,36 +119,149 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
|
|
|
119
119
|
}
|
|
120
120
|
|
|
121
121
|
#if defined(WSP_GGML_SIMD)
|
|
122
|
-
|
|
122
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
123
|
+
|
|
124
|
+
const int sve_register_length = svcntb() * 8;
|
|
125
|
+
const int wsp_ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register: register width in bits / 16 bits per element
|
|
126
|
+
const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr; // choose 8 SVE registers
|
|
127
|
+
|
|
128
|
+
const int np = (n & ~(wsp_ggml_f16_step - 1));
|
|
129
|
+
|
|
130
|
+
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
|
131
|
+
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
|
132
|
+
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
|
133
|
+
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
|
134
|
+
|
|
135
|
+
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
|
136
|
+
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
|
137
|
+
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
|
138
|
+
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
|
139
|
+
|
|
140
|
+
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
|
141
|
+
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
|
142
|
+
|
|
143
|
+
for (int i = 0; i < np; i += wsp_ggml_f16_step) {
|
|
144
|
+
ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // first wsp_ggml_f16_epr elements of y (8 only when the SVE register is 128 bits)
|
|
145
|
+
|
|
146
|
+
ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // matching wsp_ggml_f16_epr elements of row x[0]
|
|
147
|
+
sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
|
148
|
+
ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // matching wsp_ggml_f16_epr elements of row x[1]
|
|
149
|
+
sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
|
150
|
+
|
|
151
|
+
ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next wsp_ggml_f16_epr elements of y
|
|
152
|
+
|
|
153
|
+
ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next wsp_ggml_f16_epr elements of row x[0]
|
|
154
|
+
sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
|
155
|
+
ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
|
|
156
|
+
sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
|
157
|
+
|
|
158
|
+
ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
|
|
159
|
+
|
|
160
|
+
ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
|
|
161
|
+
sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
|
|
162
|
+
ax3 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2); // load row x[1] into ax3: the sum_12 FMA that follows reads ax3, which otherwise still holds the x[0] chunk
|
|
163
|
+
sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
|
|
164
|
+
|
|
165
|
+
ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
|
|
166
|
+
|
|
167
|
+
ax4 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 3*wsp_ggml_f16_epr, 3);
|
|
168
|
+
sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
|
|
169
|
+
ax4 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 3*wsp_ggml_f16_epr, 3);
|
|
170
|
+
sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
|
|
171
|
+
|
|
172
|
+
ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
|
|
173
|
+
|
|
174
|
+
ax5 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 4*wsp_ggml_f16_epr, 4);
|
|
175
|
+
|
|
176
|
+
sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
|
|
177
|
+
ax5 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 4*wsp_ggml_f16_epr, 4);
|
|
178
|
+
sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
|
|
179
|
+
|
|
180
|
+
ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
|
|
181
|
+
|
|
182
|
+
ax6 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 5*wsp_ggml_f16_epr, 5);
|
|
183
|
+
|
|
184
|
+
sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
|
|
185
|
+
ax6 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 5*wsp_ggml_f16_epr, 5);
|
|
186
|
+
sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
|
|
187
|
+
|
|
188
|
+
ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
|
|
189
|
+
|
|
190
|
+
ax7 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 6*wsp_ggml_f16_epr, 6);
|
|
191
|
+
|
|
192
|
+
sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
|
|
193
|
+
ax7 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 6*wsp_ggml_f16_epr, 6);
|
|
194
|
+
sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
|
|
195
|
+
|
|
196
|
+
ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
|
|
197
|
+
|
|
198
|
+
ax8 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 7*wsp_ggml_f16_epr, 7);
|
|
199
|
+
|
|
200
|
+
sum_03 = WSP_GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
|
|
201
|
+
ax8 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 7*wsp_ggml_f16_epr, 7);
|
|
202
|
+
sum_13 = WSP_GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
|
|
206
|
+
for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
|
|
207
|
+
svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
|
|
208
|
+
|
|
209
|
+
svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x[0] + k, 0);
|
|
210
|
+
sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, rx, ry);
|
|
211
|
+
rx = WSP_GGML_F16x_VEC_LOAD(x[1] + k, 0);
|
|
212
|
+
sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, rx, ry);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (np2 < n) {
|
|
216
|
+
svbool_t pg = svwhilelt_b16(np2, n);
|
|
217
|
+
svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
|
218
|
+
svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
|
219
|
+
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
|
220
|
+
|
|
221
|
+
sum_00 = svmla_f16_m(pg, sum_00, hx_0, hy); // merging form: lanes outside pg keep the partial sums already held in sum_00, which is reduced afterwards
|
|
222
|
+
sum_10 = svmla_f16_m(pg, sum_10, hx_1, hy); // same for the x[1] accumulator
|
|
223
|
+
}
|
|
224
|
+
WSP_GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
|
|
225
|
+
WSP_GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
|
|
226
|
+
#elif defined(__riscv_v_intrinsic)
|
|
227
|
+
// todo: RVV impl
|
|
228
|
+
for (int i = 0; i < n; ++i) {
|
|
229
|
+
for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
|
|
230
|
+
sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
#else
|
|
234
|
+
const int np = (n & ~(WSP_GGML_F16_STEP - 1));
|
|
123
235
|
|
|
124
|
-
|
|
236
|
+
WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } };
|
|
125
237
|
|
|
126
|
-
|
|
127
|
-
|
|
238
|
+
WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
|
|
239
|
+
WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
|
|
128
240
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
241
|
+
for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
|
|
242
|
+
for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
|
|
243
|
+
ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
|
|
132
244
|
|
|
133
|
-
|
|
134
|
-
|
|
245
|
+
for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
|
|
246
|
+
ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j);
|
|
135
247
|
|
|
136
|
-
|
|
248
|
+
sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
|
|
249
|
+
}
|
|
137
250
|
}
|
|
138
251
|
}
|
|
139
|
-
}
|
|
140
252
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
253
|
+
// reduce sum0..sum3 to sum0
|
|
254
|
+
for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) {
|
|
255
|
+
WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
|
|
256
|
+
}
|
|
145
257
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
258
|
+
// leftovers
|
|
259
|
+
for (int i = np; i < n; ++i) {
|
|
260
|
+
for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
|
|
261
|
+
sumf[j] += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[j][i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
|
|
262
|
+
}
|
|
150
263
|
}
|
|
151
|
-
|
|
264
|
+
#endif
|
|
152
265
|
#else
|
|
153
266
|
for (int i = 0; i < n; ++i) {
|
|
154
267
|
for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) {
|
|
@@ -243,6 +356,14 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
|
|
|
243
356
|
|
|
244
357
|
svst1_f32(pg, y + np2, ay1);
|
|
245
358
|
}
|
|
359
|
+
#elif defined(__riscv_v_intrinsic)
|
|
360
|
+
for (int i = 0, avl; i < n; i += avl) {
|
|
361
|
+
avl = __riscv_vsetvl_e32m8(n - i);
|
|
362
|
+
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
|
|
363
|
+
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
|
364
|
+
vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
|
|
365
|
+
__riscv_vse32_v_f32m8(&y[i], ny, avl);
|
|
366
|
+
}
|
|
246
367
|
#else
|
|
247
368
|
const int np = (n & ~(WSP_GGML_F32_STEP - 1));
|
|
248
369
|
|
|
@@ -276,27 +397,112 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
|
|
|
276
397
|
|
|
277
398
|
inline static void wsp_ggml_vec_mad_f16(const int n, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, const wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, const float v) {
|
|
278
399
|
#if defined(WSP_GGML_SIMD)
|
|
279
|
-
|
|
400
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
401
|
+
const int sve_register_length = svcntb() * 8;
|
|
402
|
+
const int wsp_ggml_f16_epr = sve_register_length / 16;
|
|
403
|
+
const int wsp_ggml_f16_step = 8 * wsp_ggml_f16_epr;
|
|
404
|
+
|
|
405
|
+
WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
|
|
406
|
+
|
|
407
|
+
const int np= (n & ~(wsp_ggml_f16_step - 1));
|
|
408
|
+
|
|
409
|
+
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
|
410
|
+
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
|
411
|
+
for (int i = 0; i < np; i += wsp_ggml_f16_step) {
|
|
412
|
+
ax1 = WSP_GGML_F16x_VEC_LOAD(x + i + 0 * wsp_ggml_f16_epr, 0);
|
|
413
|
+
ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0);
|
|
414
|
+
ay1 = WSP_GGML_F16x_VEC_FMA(ay1, ax1, vx);
|
|
415
|
+
|
|
416
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 0 * wsp_ggml_f16_epr, ay1, 0);
|
|
417
|
+
|
|
418
|
+
ax2 = WSP_GGML_F16x_VEC_LOAD(x + i + 1 * wsp_ggml_f16_epr, 1);
|
|
419
|
+
ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1);
|
|
420
|
+
ay2 = WSP_GGML_F16x_VEC_FMA(ay2, ax2, vx);
|
|
421
|
+
|
|
422
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 1 * wsp_ggml_f16_epr, ay2, 1);
|
|
423
|
+
|
|
424
|
+
ax3 = WSP_GGML_F16x_VEC_LOAD(x + i + 2 * wsp_ggml_f16_epr, 2);
|
|
425
|
+
ay3 = WSP_GGML_F16x_VEC_LOAD(y + i + 2 * wsp_ggml_f16_epr, 2);
|
|
426
|
+
ay3 = WSP_GGML_F16x_VEC_FMA(ay3, ax3, vx);
|
|
427
|
+
|
|
428
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 2 * wsp_ggml_f16_epr, ay3, 2);
|
|
429
|
+
|
|
430
|
+
ax4 = WSP_GGML_F16x_VEC_LOAD(x + i + 3 * wsp_ggml_f16_epr, 3);
|
|
431
|
+
ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
|
|
432
|
+
ay4 = WSP_GGML_F16x_VEC_FMA(ay4, ax4, vx);
|
|
433
|
+
|
|
434
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 3 * wsp_ggml_f16_epr, ay4, 3);
|
|
435
|
+
|
|
436
|
+
ax5 = WSP_GGML_F16x_VEC_LOAD(x + i + 4 * wsp_ggml_f16_epr, 4);
|
|
437
|
+
ay5 = WSP_GGML_F16x_VEC_LOAD(y + i + 4 * wsp_ggml_f16_epr, 4);
|
|
438
|
+
ay5 = WSP_GGML_F16x_VEC_FMA(ay5, ax5, vx);
|
|
439
|
+
|
|
440
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 4 * wsp_ggml_f16_epr, ay5, 4);
|
|
441
|
+
|
|
442
|
+
ax6 = WSP_GGML_F16x_VEC_LOAD(x + i + 5 * wsp_ggml_f16_epr, 5);
|
|
443
|
+
ay6 = WSP_GGML_F16x_VEC_LOAD(y + i + 5 * wsp_ggml_f16_epr, 5);
|
|
444
|
+
ay6 = WSP_GGML_F16x_VEC_FMA(ay6, ax6, vx);
|
|
445
|
+
|
|
446
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 5 * wsp_ggml_f16_epr, ay6, 5);
|
|
280
447
|
|
|
281
|
-
|
|
448
|
+
ax7 = WSP_GGML_F16x_VEC_LOAD(x + i + 6 * wsp_ggml_f16_epr, 6);
|
|
449
|
+
ay7 = WSP_GGML_F16x_VEC_LOAD(y + i + 6 * wsp_ggml_f16_epr, 6);
|
|
450
|
+
ay7 = WSP_GGML_F16x_VEC_FMA(ay7, ax7, vx);
|
|
282
451
|
|
|
283
|
-
|
|
284
|
-
WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
|
|
452
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 6 * wsp_ggml_f16_epr, ay7, 6);
|
|
285
453
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
|
|
290
|
-
ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
|
454
|
+
ax8 = WSP_GGML_F16x_VEC_LOAD(x + i + 7 * wsp_ggml_f16_epr, 7);
|
|
455
|
+
ay8 = WSP_GGML_F16x_VEC_LOAD(y + i + 7 * wsp_ggml_f16_epr, 7);
|
|
456
|
+
ay8 = WSP_GGML_F16x_VEC_FMA(ay8, ax8, vx);
|
|
291
457
|
|
|
292
|
-
|
|
458
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 7 * wsp_ggml_f16_epr, ay8, 7);
|
|
293
459
|
}
|
|
294
|
-
|
|
460
|
+
const int np2 = (n & ~(wsp_ggml_f16_epr - 1));
|
|
461
|
+
for (int k = np; k < np2; k += wsp_ggml_f16_epr) {
|
|
462
|
+
svfloat16_t rx = WSP_GGML_F16x_VEC_LOAD(x + k, 0);
|
|
463
|
+
svfloat16_t ry = WSP_GGML_F16x_VEC_LOAD(y + k, 0);
|
|
464
|
+
ry = WSP_GGML_F16x_VEC_FMA(ry, rx, vx);
|
|
295
465
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
466
|
+
WSP_GGML_F16x_VEC_STORE(y + k, ry, 0);
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
if (np2 < n) {
|
|
470
|
+
svbool_t pg = svwhilelt_b16(np2, n);
|
|
471
|
+
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
|
472
|
+
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
|
473
|
+
hy = svmad_f16_x(pg, hx, vx, hy);
|
|
474
|
+
svst1_f16(pg, (__fp16 *)(y + np2), hy);
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
#elif defined(__riscv_v_intrinsic)
|
|
478
|
+
// todo: RVV impl
|
|
479
|
+
// scalar
|
|
480
|
+
for (int i = 0; i < n; ++i) {
|
|
481
|
+
y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
|
|
482
|
+
}
|
|
483
|
+
#else
|
|
484
|
+
const int np = (n & ~(WSP_GGML_F16_STEP - 1));
|
|
485
|
+
|
|
486
|
+
WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
|
|
487
|
+
|
|
488
|
+
WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR];
|
|
489
|
+
WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
|
|
490
|
+
|
|
491
|
+
for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
|
|
492
|
+
for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
|
|
493
|
+
ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j);
|
|
494
|
+
ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
|
|
495
|
+
ay[j] = WSP_GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
|
496
|
+
|
|
497
|
+
WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// leftovers
|
|
502
|
+
for (int i = np; i < n; ++i) {
|
|
503
|
+
y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i]) + WSP_GGML_CPU_FP16_TO_FP32(x[i])*v);
|
|
504
|
+
}
|
|
505
|
+
#endif
|
|
300
506
|
#else
|
|
301
507
|
// scalar
|
|
302
508
|
for (int i = 0; i < n; ++i) {
|
|
@@ -324,6 +530,16 @@ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const
|
|
|
324
530
|
y[i] += x[k][i]*v[k][0];
|
|
325
531
|
}
|
|
326
532
|
}
|
|
533
|
+
#elif defined(__riscv_v_intrinsic)
|
|
534
|
+
for (int i = 0, avl; i < n; i += avl) {
|
|
535
|
+
avl = __riscv_vsetvl_e32m8(n - i);
|
|
536
|
+
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
|
537
|
+
for (int k = 0; k < WSP_GGML_VEC_MAD_UNROLL; k++) {
|
|
538
|
+
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
|
|
539
|
+
ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
|
|
540
|
+
}
|
|
541
|
+
__riscv_vse32_v_f32m8(&y[i], ay, avl);
|
|
542
|
+
}
|
|
327
543
|
#else
|
|
328
544
|
const int np = (n & ~(WSP_GGML_F32_STEP - 1));
|
|
329
545
|
|
|
@@ -375,6 +591,14 @@ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x
|
|
|
375
591
|
for (int i = 0; i < n; ++i) {
|
|
376
592
|
y[i] = x[i]*s + b;
|
|
377
593
|
}
|
|
594
|
+
#elif defined(__riscv_v_intrinsic)
|
|
595
|
+
for (int i = 0, avl; i < n; i += avl) {
|
|
596
|
+
avl = __riscv_vsetvl_e32m8(n - i);
|
|
597
|
+
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
|
|
598
|
+
vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
|
|
599
|
+
vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
|
|
600
|
+
__riscv_vse32_v_f32m8(&y[i], ny, avl);
|
|
601
|
+
}
|
|
378
602
|
#else
|
|
379
603
|
const int np = (n & ~(WSP_GGML_F32_STEP - 1));
|
|
380
604
|
|
|
@@ -386,7 +610,7 @@ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x
|
|
|
386
610
|
for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
|
|
387
611
|
for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
|
|
388
612
|
ay[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR);
|
|
389
|
-
ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], vs
|
|
613
|
+
ay[j] = WSP_GGML_F32_VEC_FMA(vb, ay[j], vs);
|
|
390
614
|
|
|
391
615
|
WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
|
|
392
616
|
}
|
|
@@ -436,6 +660,13 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
|
|
|
436
660
|
ay1 = svmul_f32_m(pg, ay1, vx);
|
|
437
661
|
svst1_f32(pg, y + np, ay1);
|
|
438
662
|
}
|
|
663
|
+
#elif defined(__riscv_v_intrinsic)
|
|
664
|
+
for (int i = 0, avl; i < n; i += avl) {
|
|
665
|
+
avl = __riscv_vsetvl_e32m8(n - i);
|
|
666
|
+
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
|
667
|
+
vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
|
|
668
|
+
__riscv_vse32_v_f32m8(&y[i], ny, avl);
|
|
669
|
+
}
|
|
439
670
|
#else
|
|
440
671
|
const int np = (n & ~(WSP_GGML_F32_STEP - 1));
|
|
441
672
|
|
|
@@ -467,25 +698,59 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
|
|
|
467
698
|
|
|
468
699
|
inline static void wsp_ggml_vec_scale_f16(const int n, wsp_ggml_fp16_t * y, const float v) {
|
|
469
700
|
#if defined(WSP_GGML_SIMD)
|
|
470
|
-
|
|
701
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
702
|
+
const int sve_register_length = svcntb() * 8;
|
|
703
|
+
const int wsp_ggml_f16_epr = sve_register_length / 16;
|
|
704
|
+
const int wsp_ggml_f16_step = 2 * wsp_ggml_f16_epr;
|
|
705
|
+
|
|
706
|
+
WSP_GGML_F16x_VEC vx = WSP_GGML_F16x_VEC_SET1(v);
|
|
707
|
+
const int np = (n & ~(wsp_ggml_f16_step - 1));
|
|
708
|
+
svfloat16_t ay1, ay2;
|
|
709
|
+
|
|
710
|
+
for (int i = 0; i < np; i += wsp_ggml_f16_step) {
|
|
711
|
+
ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0*wsp_ggml_f16_epr, 0);
|
|
712
|
+
ay1 = WSP_GGML_F16x_VEC_MUL(ay1, vx);
|
|
713
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 0*wsp_ggml_f16_epr, ay1, 0);
|
|
714
|
+
|
|
715
|
+
ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1*wsp_ggml_f16_epr, 1);
|
|
716
|
+
ay2 = WSP_GGML_F16x_VEC_MUL(ay2, vx);
|
|
717
|
+
WSP_GGML_F16x_VEC_STORE(y + i + 1*wsp_ggml_f16_epr, ay2, 1);
|
|
718
|
+
}
|
|
719
|
+
// leftovers
|
|
720
|
+
// maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
|
|
721
|
+
if (np < n) {
|
|
722
|
+
svbool_t pg = svwhilelt_b16(np, n);
|
|
723
|
+
svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
|
|
724
|
+
svfloat16_t out = svmul_f16_m(pg, hy, vx);
|
|
725
|
+
svst1_f16(pg, (__fp16 *)(y + np), out);
|
|
726
|
+
}
|
|
727
|
+
#elif defined(__riscv_v_intrinsic)
|
|
728
|
+
// todo: RVV impl
|
|
729
|
+
// scalar
|
|
730
|
+
for (int i = 0; i < n; ++i) {
|
|
731
|
+
y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
|
|
732
|
+
}
|
|
733
|
+
#else
|
|
734
|
+
const int np = (n & ~(WSP_GGML_F16_STEP - 1));
|
|
471
735
|
|
|
472
|
-
|
|
736
|
+
WSP_GGML_F16_VEC vx = WSP_GGML_F16_VEC_SET1(v);
|
|
473
737
|
|
|
474
|
-
|
|
738
|
+
WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR];
|
|
475
739
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
740
|
+
for (int i = 0; i < np; i += WSP_GGML_F16_STEP) {
|
|
741
|
+
for (int j = 0; j < WSP_GGML_F16_ARR; j++) {
|
|
742
|
+
ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j);
|
|
743
|
+
ay[j] = WSP_GGML_F16_VEC_MUL(ay[j], vx);
|
|
480
744
|
|
|
481
|
-
|
|
745
|
+
WSP_GGML_F16_VEC_STORE(y + i + j*WSP_GGML_F16_EPR, ay, j);
|
|
746
|
+
}
|
|
482
747
|
}
|
|
483
|
-
}
|
|
484
748
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
749
|
+
// leftovers
|
|
750
|
+
for (int i = np; i < n; ++i) {
|
|
751
|
+
y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(y[i])*v);
|
|
752
|
+
}
|
|
753
|
+
#endif
|
|
489
754
|
#else
|
|
490
755
|
// scalar
|
|
491
756
|
for (int i = 0; i < n; ++i) {
|
|
@@ -737,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
|
|
|
737
1002
|
}
|
|
738
1003
|
#endif
|
|
739
1004
|
|
|
740
|
-
#if defined(
|
|
1005
|
+
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
|
1006
|
+
|
|
1007
|
+
// SVE single-precision vector exp(x). pg is the governing predicate handed to every intrinsic in the body.
// Visible steps: n = (x*log2(e) + 0x1.8p23f) - 0x1.8p23f, remainder b = x - n*ln2 (split into two constants),
// a polynomial j in b, and a scale k built from the bits of z shifted left by 23 plus the bits of 1.0f.
// Lanes with |n| > 126 take the s1/s2 rescaled path and lanes with |n| > 192 return s1*s1.
inline static svfloat32_t wsp_ggml_v_expf(svbool_t pg, svfloat32_t x) {
|
|
1008
|
+
const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
|
|
1009
|
+
const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
|
|
1010
|
+
const svfloat32_t n = svsub_f32_x(pg, z, r);
|
|
1011
|
+
const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
|
|
1012
|
+
const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
|
|
1013
|
+
const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
|
|
1014
|
+
const svbool_t c = svacgt_n_f32(pg, n, 126);
|
|
1015
|
+
const svfloat32_t u = svmul_f32_x(pg, b, b);
|
|
1016
|
+
const svfloat32_t j = svmla_f32_x(pg,
|
|
1017
|
+
svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
|
|
1018
|
+
svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
|
|
1019
|
+
svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
|
|
1020
|
+
const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
|
|
1021
|
+
const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
|
|
1022
|
+
const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
|
|
1023
|
+
return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
|
|
1024
|
+
svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
|
1028
|
+
// pg: governing predicate passed to each intrinsic; x: input vector.
// Forms -x as 0 - x, evaluates exp(-x) with wsp_ggml_v_expf, adds 1, and returns x divided by that sum.
inline static svfloat32_t wsp_ggml_v_silu(svbool_t pg, svfloat32_t x) {
|
|
1029
|
+
const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
|
|
1030
|
+
const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
|
|
1031
|
+
const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
|
|
1032
|
+
const svfloat32_t exp_neg_x = wsp_ggml_v_expf(pg, neg_x);
|
|
1033
|
+
const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
|
|
1034
|
+
return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
741
1038
|
|
|
742
1039
|
// adapted from arm limited optimized routine
|
|
743
1040
|
// the maximum error is 1.45358 plus 0.5 ulps
|
|
@@ -928,7 +1225,59 @@ inline static __m128 wsp_ggml_v_silu(__m128 x) {
|
|
|
928
1225
|
return _mm_div_ps(x, one_plus_exp_neg_x);
|
|
929
1226
|
}
|
|
930
1227
|
|
|
931
|
-
#
|
|
1228
|
+
#elif defined(__riscv_v_intrinsic)
|
|
1229
|
+
|
|
1230
|
+
// adapted from arm limited optimized routine
|
|
1231
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
|
1232
|
+
// numbers above 88.38 will flush to infinity
|
|
1233
|
+
// numbers beneath -103.97 will flush to zero
|
|
1234
|
+
// RVV vector exp(x) on vfloat32m2_t; vl is forwarded as the element count to every intrinsic.
// Returns early with k + j*k when no lane has |n| > 126; otherwise merges the rescaled s1/s2 result
// for those lanes and s1*s1 for lanes with |n| > 192.
inline static vfloat32m2_t wsp_ggml_v_expf_m2(vfloat32m2_t x, int vl) {
|
|
1235
|
+
const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
|
|
1236
|
+
#ifdef __riscv_xtheadvector
|
|
1237
|
+
// workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
|
|
1238
|
+
vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
|
|
1239
|
+
z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
|
|
1240
|
+
#else
|
|
1241
|
+
const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
|
|
1242
|
+
#endif
|
|
1243
|
+
const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
|
|
1244
|
+
const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
|
|
1245
|
+
0x1.7f7d1cp-20f, n, vl);
|
|
1246
|
+
const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
|
|
1247
|
+
const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
|
|
1248
|
+
const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
|
|
1249
|
+
const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
|
|
1250
|
+
const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
|
|
1251
|
+
__riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
|
|
1252
|
+
__riscv_vfmacc_vv_f32m2(
|
|
1253
|
+
__riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
|
|
1254
|
+
__riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
|
|
1255
|
+
u, vl), u, vl);
|
|
1256
|
+
if (!__riscv_vcpop_m_b16(c, vl))
|
|
1257
|
+
return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
|
|
1258
|
+
const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
|
|
1259
|
+
const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
|
|
1260
|
+
const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
|
|
1261
|
+
const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
|
|
1262
|
+
const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
|
|
1263
|
+
__riscv_vfmacc_vv_f32m2(k, k, j, vl),
|
|
1264
|
+
__riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
|
|
1265
|
+
c, vl);
|
|
1266
|
+
return __riscv_vmerge_vvm_f32m2(
|
|
1267
|
+
r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
|
|
1268
|
+
__riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
|
|
1269
|
+
vl);
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
|
1273
|
+
inline static vfloat32m2_t wsp_ggml_v_silu_m2(vfloat32m2_t x, int vl) {
|
|
1274
|
+
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
|
|
1275
|
+
const vfloat32m2_t exp_neg_x = wsp_ggml_v_expf_m2(neg_x, vl);
|
|
1276
|
+
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
|
|
1277
|
+
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
|
|
932
1281
|
|
|
933
1282
|
inline static void wsp_ggml_vec_silu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
|
|
934
1283
|
for (int i = 0; i < n; ++i) {
|
package/cpp/ggml-cpu.h
CHANGED
|
@@ -101,7 +101,6 @@ extern "C" {
|
|
|
101
101
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
|
|
102
102
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
|
|
103
103
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
|
|
104
|
-
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
|
|
105
104
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
|
|
106
105
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);
|
|
107
106
|
|
|
@@ -135,6 +134,7 @@ extern "C" {
|
|
|
135
134
|
WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
|
|
136
135
|
|
|
137
136
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
137
|
+
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
|
|
138
138
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
|
|
139
139
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
|
|
140
140
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);
|
package/cpp/ggml-impl.h
CHANGED
|
@@ -73,7 +73,7 @@ static inline int wsp_ggml_up(int n, int m) {
|
|
|
73
73
|
return (n + m - 1) & ~(m - 1);
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
-
// TODO: move to ggml.h?
|
|
76
|
+
// TODO: move to ggml.h? (won't be able to inline)
|
|
77
77
|
static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
|
|
78
78
|
if (a->type != b->type) {
|
|
79
79
|
return false;
|
|
@@ -89,6 +89,19 @@ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const str
|
|
|
89
89
|
return true;
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
+
static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
|
|
93
|
+
switch (op) {
|
|
94
|
+
case WSP_GGML_OP_NONE:
|
|
95
|
+
case WSP_GGML_OP_RESHAPE:
|
|
96
|
+
case WSP_GGML_OP_TRANSPOSE:
|
|
97
|
+
case WSP_GGML_OP_VIEW:
|
|
98
|
+
case WSP_GGML_OP_PERMUTE:
|
|
99
|
+
return true;
|
|
100
|
+
default:
|
|
101
|
+
return false;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
92
105
|
//
|
|
93
106
|
// logging
|
|
94
107
|
//
|
|
@@ -329,6 +342,10 @@ struct wsp_ggml_cgraph {
|
|
|
329
342
|
// if you need the gradients, get them from the original graph
|
|
330
343
|
struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph, int i0, int i1);
|
|
331
344
|
|
|
345
|
+
// ggml-alloc.c: true if the operation can reuse memory from its sources
|
|
346
|
+
WSP_GGML_API bool wsp_ggml_op_can_inplace(enum wsp_ggml_op op);
|
|
347
|
+
|
|
348
|
+
|
|
332
349
|
// Memory allocation
|
|
333
350
|
|
|
334
351
|
WSP_GGML_API void * wsp_ggml_aligned_malloc(size_t size);
|
|
@@ -570,27 +587,27 @@ static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgraph * cgrap
|
|
|
570
587
|
return true;
|
|
571
588
|
}
|
|
572
589
|
|
|
573
|
-
// Returns true if nodes
|
|
590
|
+
// Returns true if nodes with indices { node_idxs } are the sequence of wsp_ggml_ops in ops[]
|
|
574
591
|
// and are fusable. Nodes are considered fusable according to this function if:
|
|
575
592
|
// - all nodes except the last have only one use and are not views/outputs (see wsp_ggml_node_has_n_uses).
|
|
576
593
|
// - all nodes except the last are a src of the following node.
|
|
577
594
|
// - all nodes are the same shape.
|
|
578
595
|
// TODO: Consider allowing WSP_GGML_OP_NONE nodes in between
|
|
579
|
-
static inline bool
|
|
580
|
-
if (node_idx + num_ops > cgraph->n_nodes) {
|
|
581
|
-
return false;
|
|
582
|
-
}
|
|
583
|
-
|
|
596
|
+
static inline bool wsp_ggml_can_fuse_ext(const struct wsp_ggml_cgraph * cgraph, const int * node_idxs, const enum wsp_ggml_op * ops, int num_ops) {
|
|
584
597
|
for (int i = 0; i < num_ops; ++i) {
|
|
585
|
-
|
|
598
|
+
if (node_idxs[i] >= cgraph->n_nodes) {
|
|
599
|
+
return false;
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
struct wsp_ggml_tensor * node = cgraph->nodes[node_idxs[i]];
|
|
586
603
|
if (node->op != ops[i]) {
|
|
587
604
|
return false;
|
|
588
605
|
}
|
|
589
|
-
if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph,
|
|
606
|
+
if (i < num_ops - 1 && !wsp_ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
|
|
590
607
|
return false;
|
|
591
608
|
}
|
|
592
609
|
if (i > 0) {
|
|
593
|
-
struct wsp_ggml_tensor * prev = cgraph->nodes[
|
|
610
|
+
struct wsp_ggml_tensor * prev = cgraph->nodes[node_idxs[i - 1]];
|
|
594
611
|
if (node->src[0] != prev && node->src[1] != prev) {
|
|
595
612
|
return false;
|
|
596
613
|
}
|
|
@@ -602,6 +619,22 @@ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int
|
|
|
602
619
|
return true;
|
|
603
620
|
}
|
|
604
621
|
|
|
622
|
+
// same as above, for sequential indices starting at node_idx
|
|
623
|
+
static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_idx, const enum wsp_ggml_op * ops, int num_ops) {
|
|
624
|
+
assert(num_ops < 32);
|
|
625
|
+
|
|
626
|
+
if (node_idx + num_ops > cgraph->n_nodes) {
|
|
627
|
+
return false;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
int idxs[32];
|
|
631
|
+
for (int i = 0; i < num_ops; ++i) {
|
|
632
|
+
idxs[i] = node_idx + i;
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
return wsp_ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
|
|
636
|
+
}
|
|
637
|
+
|
|
605
638
|
#ifdef __cplusplus
|
|
606
639
|
}
|
|
607
640
|
#endif
|