whisper.rn 0.5.0-rc.9 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/gradle.properties +1 -1
- package/cpp/ggml-alloc.c +265 -141
- package/cpp/ggml-backend-impl.h +4 -1
- package/cpp/ggml-backend-reg.cpp +30 -13
- package/cpp/ggml-backend.cpp +221 -38
- package/cpp/ggml-backend.h +17 -1
- package/cpp/ggml-common.h +17 -0
- package/cpp/ggml-cpu/amx/amx.cpp +4 -2
- package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/ggml-cpu/arch-fallback.h +32 -2
- package/cpp/ggml-cpu/common.h +14 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +13 -6
- package/cpp/ggml-cpu/ggml-cpu.c +70 -42
- package/cpp/ggml-cpu/ggml-cpu.cpp +35 -28
- package/cpp/ggml-cpu/ops.cpp +1587 -1177
- package/cpp/ggml-cpu/ops.h +5 -8
- package/cpp/ggml-cpu/quants.c +35 -0
- package/cpp/ggml-cpu/quants.h +8 -0
- package/cpp/ggml-cpu/repack.cpp +458 -47
- package/cpp/ggml-cpu/repack.h +22 -0
- package/cpp/ggml-cpu/simd-mappings.h +89 -60
- package/cpp/ggml-cpu/traits.cpp +2 -2
- package/cpp/ggml-cpu/traits.h +1 -1
- package/cpp/ggml-cpu/vec.cpp +170 -26
- package/cpp/ggml-cpu/vec.h +506 -63
- package/cpp/ggml-cpu.h +1 -1
- package/cpp/ggml-impl.h +119 -9
- package/cpp/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/ggml-metal/ggml-metal-context.m +600 -0
- package/cpp/ggml-metal/ggml-metal-device.cpp +1376 -0
- package/cpp/ggml-metal/ggml-metal-device.h +226 -0
- package/cpp/ggml-metal/ggml-metal-device.m +1312 -0
- package/cpp/ggml-metal/ggml-metal-impl.h +722 -0
- package/cpp/ggml-metal/ggml-metal-ops.cpp +3158 -0
- package/cpp/ggml-metal/ggml-metal-ops.h +82 -0
- package/cpp/ggml-metal/ggml-metal.cpp +718 -0
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/cpp/ggml-metal-impl.h +90 -51
- package/cpp/ggml-metal.h +1 -6
- package/cpp/ggml-opt.cpp +97 -41
- package/cpp/ggml-opt.h +25 -6
- package/cpp/ggml-quants.c +111 -16
- package/cpp/ggml-quants.h +6 -0
- package/cpp/ggml.c +486 -98
- package/cpp/ggml.h +221 -16
- package/cpp/gguf.cpp +8 -1
- package/cpp/jsi/RNWhisperJSI.cpp +25 -6
- package/cpp/jsi/ThreadPool.h +3 -3
- package/cpp/whisper.cpp +100 -76
- package/cpp/whisper.h +1 -0
- package/ios/CMakeLists.txt +6 -1
- package/ios/RNWhisper.mm +6 -6
- package/ios/RNWhisperContext.mm +2 -0
- package/ios/RNWhisperVadContext.mm +16 -13
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +4 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +17 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +119 -9
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +90 -51
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +1 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +221 -16
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
- package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
- package/lib/typescript/realtime-transcription/types.d.ts +6 -0
- package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
- package/src/realtime-transcription/types.ts +6 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +8 -9
- package/cpp/ggml-metal.m +0 -6284
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
package/cpp/whisper.cpp
CHANGED
|
@@ -21,14 +21,12 @@
|
|
|
21
21
|
#define _USE_MATH_DEFINES
|
|
22
22
|
#include <cmath>
|
|
23
23
|
#include <climits>
|
|
24
|
-
#include <codecvt>
|
|
25
24
|
#include <cstdarg>
|
|
26
25
|
#include <cstdio>
|
|
27
26
|
#include <cstring>
|
|
28
27
|
#include <fstream>
|
|
29
28
|
#include <functional>
|
|
30
29
|
#include <map>
|
|
31
|
-
#include <mutex>
|
|
32
30
|
#include <random>
|
|
33
31
|
#include <regex>
|
|
34
32
|
#include <set>
|
|
@@ -36,6 +34,10 @@
|
|
|
36
34
|
#include <thread>
|
|
37
35
|
#include <vector>
|
|
38
36
|
|
|
37
|
+
#ifdef _MSC_VER
|
|
38
|
+
#include <codecvt>
|
|
39
|
+
#endif
|
|
40
|
+
|
|
39
41
|
#if defined(WHISPER_BIG_ENDIAN)
|
|
40
42
|
template<typename T>
|
|
41
43
|
static T byteswap(T value) {
|
|
@@ -138,6 +140,10 @@ static void whisper_log_callback_default(wsp_ggml_log_level level, const char *
|
|
|
138
140
|
} while (0)
|
|
139
141
|
|
|
140
142
|
#define WHISPER_MAX_DECODERS 8
|
|
143
|
+
|
|
144
|
+
// temperature below which we condition on past text history
|
|
145
|
+
static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
|
|
146
|
+
|
|
141
147
|
#define WHISPER_MAX_NODES 4096
|
|
142
148
|
|
|
143
149
|
static std::string format(const char * fmt, ...) {
|
|
@@ -252,45 +258,6 @@ static void whisper_set_i32_nd(struct wsp_ggml_tensor * t, int64_t i0, int64_t i
|
|
|
252
258
|
*(int32_t *) data = v;
|
|
253
259
|
}
|
|
254
260
|
|
|
255
|
-
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
|
256
|
-
// the idea is to represent the original matrix multiplication:
|
|
257
|
-
//
|
|
258
|
-
// Z = X @ Y
|
|
259
|
-
//
|
|
260
|
-
// with the sum of two matrix multiplications:
|
|
261
|
-
//
|
|
262
|
-
// Z = (X_0 @ Y_0) + (X_1 @ Y_1)
|
|
263
|
-
//
|
|
264
|
-
// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
|
|
265
|
-
// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
|
|
266
|
-
// general-purpose kernels
|
|
267
|
-
//
|
|
268
|
-
static struct wsp_ggml_tensor * wsp_ggml_mul_mat_pad(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * x, struct wsp_ggml_tensor * y, int pad = 32) {
|
|
269
|
-
// use padding only if dimension 0 is at least 8 times larger than the padding
|
|
270
|
-
// else we won't get much benefit from the optimization
|
|
271
|
-
const int n_pad_req = 8;
|
|
272
|
-
|
|
273
|
-
if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
|
|
274
|
-
return wsp_ggml_mul_mat(ctx, x, y);
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
struct wsp_ggml_tensor * x_0 = wsp_ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
|
|
278
|
-
struct wsp_ggml_tensor * x_1 = wsp_ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
|
|
279
|
-
|
|
280
|
-
struct wsp_ggml_tensor * y_0 = wsp_ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
|
|
281
|
-
struct wsp_ggml_tensor * y_1 = wsp_ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
|
|
282
|
-
|
|
283
|
-
return wsp_ggml_add(ctx,
|
|
284
|
-
wsp_ggml_mul_mat(ctx, x_0, y_0),
|
|
285
|
-
wsp_ggml_mul_mat(ctx, x_1, y_1));
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
// TODO: check if other platforms can benefit from this optimization
|
|
289
|
-
// TODO: CUDA is currently broken - seems wsp_ggml_mul_mat does not handle views correctly
|
|
290
|
-
#if defined(WSP_GGML_USE_METAL)
|
|
291
|
-
#define wsp_ggml_mul_mat wsp_ggml_mul_mat_pad
|
|
292
|
-
#endif
|
|
293
|
-
|
|
294
261
|
// available whisper models
|
|
295
262
|
enum e_model {
|
|
296
263
|
MODEL_UNKNOWN,
|
|
@@ -919,7 +886,10 @@ struct whisper_state {
|
|
|
919
886
|
std::vector<float> logits;
|
|
920
887
|
|
|
921
888
|
std::vector<whisper_segment> result_all;
|
|
922
|
-
|
|
889
|
+
|
|
890
|
+
// prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
|
|
891
|
+
std::vector<whisper_token> prompt_past0; // static carried initial prompt (if enabled)
|
|
892
|
+
std::vector<whisper_token> prompt_past1; // dynamic context from decoded output
|
|
923
893
|
|
|
924
894
|
int lang_id = 0; // english by default
|
|
925
895
|
|
|
@@ -1327,7 +1297,7 @@ static wsp_ggml_backend_t whisper_backend_init_gpu(const whisper_context_params
|
|
|
1327
1297
|
for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
|
|
1328
1298
|
wsp_ggml_backend_dev_t dev_cur = wsp_ggml_backend_dev_get(i);
|
|
1329
1299
|
if (wsp_ggml_backend_dev_type(dev_cur) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
1330
|
-
if (cnt ==
|
|
1300
|
+
if (cnt == params.gpu_device) {
|
|
1331
1301
|
dev = dev_cur;
|
|
1332
1302
|
}
|
|
1333
1303
|
|
|
@@ -1396,7 +1366,7 @@ static buft_list_t make_buft_list(whisper_context_params & params) {
|
|
|
1396
1366
|
for (size_t i = 0; i < wsp_ggml_backend_dev_count(); ++i) {
|
|
1397
1367
|
wsp_ggml_backend_dev_t dev = wsp_ggml_backend_dev_get(i);
|
|
1398
1368
|
if (wsp_ggml_backend_dev_type(dev) == WSP_GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
1399
|
-
if (cnt ==
|
|
1369
|
+
if (cnt == params.gpu_device) {
|
|
1400
1370
|
auto * buft = wsp_ggml_backend_dev_buffer_type(dev);
|
|
1401
1371
|
if (buft) {
|
|
1402
1372
|
buft_list.emplace_back(dev, buft);
|
|
@@ -1438,7 +1408,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
|
|
|
1438
1408
|
op_supported = true;
|
|
1439
1409
|
} else {
|
|
1440
1410
|
switch (op) {
|
|
1441
|
-
// The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT
|
|
1411
|
+
// The current extra_buffer_type implementations only support WSP_GGML_OP_MUL_MAT and WSP_GGML_OP_GET_ROWS
|
|
1412
|
+
case WSP_GGML_OP_GET_ROWS:
|
|
1442
1413
|
case WSP_GGML_OP_MUL_MAT: {
|
|
1443
1414
|
wsp_ggml_init_params params = {
|
|
1444
1415
|
/*.mem_size =*/ 2 * wsp_ggml_tensor_overhead(),
|
|
@@ -1454,9 +1425,15 @@ static bool weight_buft_supported(const whisper_hparams & hparams, wsp_ggml_tens
|
|
|
1454
1425
|
|
|
1455
1426
|
wsp_ggml_tensor * op_tensor = nullptr;
|
|
1456
1427
|
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1428
|
+
if (op == WSP_GGML_OP_MUL_MAT) {
|
|
1429
|
+
int64_t n_ctx = hparams.n_audio_ctx;
|
|
1430
|
+
wsp_ggml_tensor * b = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
|
|
1431
|
+
op_tensor = wsp_ggml_mul_mat(ctx, w, b);
|
|
1432
|
+
} else if (op == WSP_GGML_OP_GET_ROWS) {
|
|
1433
|
+
int64_t num_indices = 8;
|
|
1434
|
+
wsp_ggml_tensor * indices = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, num_indices);
|
|
1435
|
+
op_tensor = wsp_ggml_get_rows(ctx, w, indices);
|
|
1436
|
+
}
|
|
1460
1437
|
|
|
1461
1438
|
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
|
1462
1439
|
WSP_GGML_ASSERT(w->buffer == nullptr);
|
|
@@ -2425,6 +2402,8 @@ static bool whisper_encode_internal(
|
|
|
2425
2402
|
return false;
|
|
2426
2403
|
}
|
|
2427
2404
|
} else {
|
|
2405
|
+
wsp_ggml_backend_sched_reset(sched);
|
|
2406
|
+
|
|
2428
2407
|
#if defined(WHISPER_USE_COREML)
|
|
2429
2408
|
whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) wstate.embd_enc->data);
|
|
2430
2409
|
#elif defined(WHISPER_USE_OPENVINO)
|
|
@@ -3626,7 +3605,7 @@ struct whisper_context_params whisper_context_default_params() {
|
|
|
3626
3605
|
struct whisper_context_params result = {
|
|
3627
3606
|
/*.use_gpu =*/ true,
|
|
3628
3607
|
/*.use_coreml =*/ false,
|
|
3629
|
-
/*.flash_attn =*/
|
|
3608
|
+
/*.flash_attn =*/ true,
|
|
3630
3609
|
/*.gpu_device =*/ 0,
|
|
3631
3610
|
|
|
3632
3611
|
/*.dtw_token_timestamps =*/ false,
|
|
@@ -4710,6 +4689,7 @@ static bool whisper_vad_init_context(whisper_vad_context * vctx) {
|
|
|
4710
4689
|
wsp_ggml_set_name(vctx->c_state, "c_state");
|
|
4711
4690
|
|
|
4712
4691
|
vctx->buffer = wsp_ggml_backend_alloc_ctx_tensors(ctx, vctx->backends[0]);
|
|
4692
|
+
wsp_ggml_free(ctx);
|
|
4713
4693
|
if (!vctx->buffer) {
|
|
4714
4694
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for the VAD state\n", __func__);
|
|
4715
4695
|
return false;
|
|
@@ -5454,6 +5434,9 @@ struct whisper_vad_segments * whisper_vad_segments_from_samples(
|
|
|
5454
5434
|
|
|
5455
5435
|
void whisper_vad_free(whisper_vad_context * ctx) {
|
|
5456
5436
|
if (ctx) {
|
|
5437
|
+
if (ctx->buffer) {
|
|
5438
|
+
wsp_ggml_backend_buffer_free(ctx->buffer);
|
|
5439
|
+
}
|
|
5457
5440
|
for (wsp_ggml_context * context : ctx->model.ctxs) {
|
|
5458
5441
|
wsp_ggml_free(context);
|
|
5459
5442
|
}
|
|
@@ -5468,6 +5451,9 @@ void whisper_vad_free(whisper_vad_context * ctx) {
|
|
|
5468
5451
|
wsp_ggml_backend_free(backend);
|
|
5469
5452
|
}
|
|
5470
5453
|
|
|
5454
|
+
delete[] ctx->model.hparams.encoder_in_channels;
|
|
5455
|
+
delete[] ctx->model.hparams.encoder_out_channels;
|
|
5456
|
+
delete[] ctx->model.hparams.kernel_sizes;
|
|
5471
5457
|
|
|
5472
5458
|
delete ctx;
|
|
5473
5459
|
}
|
|
@@ -5947,9 +5933,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
|
5947
5933
|
|
|
5948
5934
|
/* suppress_regex =*/ nullptr,
|
|
5949
5935
|
|
|
5950
|
-
/*.initial_prompt
|
|
5951
|
-
/*.
|
|
5952
|
-
/*.
|
|
5936
|
+
/*.initial_prompt =*/ nullptr,
|
|
5937
|
+
/*.carry_initial_prompt =*/ false,
|
|
5938
|
+
/*.prompt_tokens =*/ nullptr,
|
|
5939
|
+
/*.prompt_n_tokens =*/ 0,
|
|
5953
5940
|
|
|
5954
5941
|
/*.language =*/ "en",
|
|
5955
5942
|
/*.detect_language =*/ false,
|
|
@@ -6645,6 +6632,10 @@ static bool whisper_vad(
|
|
|
6645
6632
|
|
|
6646
6633
|
whisper_vad_segments * vad_segments = whisper_vad_segments_from_samples(vctx, vad_params, samples, n_samples);
|
|
6647
6634
|
|
|
6635
|
+
if (!vad_segments) {
|
|
6636
|
+
return false;
|
|
6637
|
+
}
|
|
6638
|
+
|
|
6648
6639
|
if (vad_segments->data.size() > 0) {
|
|
6649
6640
|
state->has_vad_segments = true;
|
|
6650
6641
|
ctx->state->vad_segments.clear();
|
|
@@ -6687,7 +6678,6 @@ static bool whisper_vad(
|
|
|
6687
6678
|
} catch (const std::bad_alloc & /* e */) {
|
|
6688
6679
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
|
|
6689
6680
|
whisper_vad_free_segments(vad_segments);
|
|
6690
|
-
whisper_vad_free(vctx);
|
|
6691
6681
|
return false;
|
|
6692
6682
|
}
|
|
6693
6683
|
|
|
@@ -6793,6 +6783,7 @@ static bool whisper_vad(
|
|
|
6793
6783
|
__func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
|
|
6794
6784
|
}
|
|
6795
6785
|
|
|
6786
|
+
whisper_vad_free_segments(vad_segments);
|
|
6796
6787
|
return true;
|
|
6797
6788
|
}
|
|
6798
6789
|
|
|
@@ -6901,17 +6892,22 @@ int whisper_full_with_state(
|
|
|
6901
6892
|
decoder.rng = std::mt19937(j);
|
|
6902
6893
|
}
|
|
6903
6894
|
|
|
6904
|
-
// the accumulated text context
|
|
6905
|
-
auto &
|
|
6895
|
+
// the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
|
|
6896
|
+
auto & prompt_past0 = state->prompt_past0;
|
|
6897
|
+
auto & prompt_past1 = state->prompt_past1;
|
|
6906
6898
|
if (params.no_context) {
|
|
6907
|
-
|
|
6899
|
+
prompt_past0.clear();
|
|
6900
|
+
prompt_past1.clear();
|
|
6908
6901
|
}
|
|
6909
6902
|
|
|
6903
|
+
// calculate the maximum context budget for prompt history
|
|
6904
|
+
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
|
|
6905
|
+
|
|
6910
6906
|
// prepare prompt
|
|
6911
6907
|
{
|
|
6912
6908
|
std::vector<whisper_token> prompt_tokens;
|
|
6913
6909
|
|
|
6914
|
-
// initial prompt
|
|
6910
|
+
// tokenize the initial prompt
|
|
6915
6911
|
if (!params.prompt_tokens && params.initial_prompt) {
|
|
6916
6912
|
prompt_tokens.resize(1024);
|
|
6917
6913
|
int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
|
|
@@ -6923,14 +6919,25 @@ int whisper_full_with_state(
|
|
|
6923
6919
|
params.prompt_tokens = prompt_tokens.data();
|
|
6924
6920
|
params.prompt_n_tokens = prompt_tokens.size();
|
|
6925
6921
|
}
|
|
6926
|
-
|
|
6927
|
-
// prepend the prompt tokens to the prompt_past
|
|
6928
6922
|
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
|
|
6929
|
-
|
|
6930
|
-
|
|
6931
|
-
|
|
6923
|
+
if (params.carry_initial_prompt) {
|
|
6924
|
+
if (prompt_past0.empty()) {
|
|
6925
|
+
const int max_tokens = std::max(1, max_prompt_ctx - 1);
|
|
6926
|
+
|
|
6927
|
+
if (params.prompt_n_tokens > max_tokens) {
|
|
6928
|
+
WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
|
|
6929
|
+
__func__, params.prompt_n_tokens, max_tokens);
|
|
6930
|
+
}
|
|
6931
|
+
|
|
6932
|
+
const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
|
|
6933
|
+
prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
|
|
6934
|
+
}
|
|
6935
|
+
} else {
|
|
6936
|
+
for (int i = 0; i < params.prompt_n_tokens; ++i) {
|
|
6937
|
+
prompt_past1.push_back(params.prompt_tokens[i]);
|
|
6938
|
+
}
|
|
6939
|
+
std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
|
|
6932
6940
|
}
|
|
6933
|
-
std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
|
|
6934
6941
|
}
|
|
6935
6942
|
}
|
|
6936
6943
|
|
|
@@ -7016,7 +7023,8 @@ int whisper_full_with_state(
|
|
|
7016
7023
|
// if there is a very short audio segment left to process, we remove any past prompt since it tends
|
|
7017
7024
|
// to confuse the decoder and often make it repeat or hallucinate stuff
|
|
7018
7025
|
if (seek > seek_start && seek + 500 >= seek_end) {
|
|
7019
|
-
|
|
7026
|
+
prompt_past0.clear();
|
|
7027
|
+
prompt_past1.clear();
|
|
7020
7028
|
}
|
|
7021
7029
|
|
|
7022
7030
|
int best_decoder_id = 0;
|
|
@@ -7077,12 +7085,25 @@ int whisper_full_with_state(
|
|
|
7077
7085
|
{
|
|
7078
7086
|
prompt.clear();
|
|
7079
7087
|
|
|
7080
|
-
|
|
7081
|
-
|
|
7082
|
-
|
|
7088
|
+
if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
|
|
7089
|
+
const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
|
|
7090
|
+
const bool can_take1 = !prompt_past1.empty();
|
|
7083
7091
|
|
|
7084
|
-
|
|
7085
|
-
|
|
7092
|
+
if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
|
|
7093
|
+
// Always start with previous token marker to connect continuity
|
|
7094
|
+
prompt.push_back(whisper_token_prev(ctx));
|
|
7095
|
+
|
|
7096
|
+
// Take static tokens (initial prompt) first
|
|
7097
|
+
int n_take0 = 0;
|
|
7098
|
+
if (can_take0) {
|
|
7099
|
+
n_take0 = prompt_past0.size();
|
|
7100
|
+
prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
|
|
7101
|
+
}
|
|
7102
|
+
|
|
7103
|
+
// Fill remaining budget with dynamic tokens (rolling context)
|
|
7104
|
+
const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
|
|
7105
|
+
prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
|
|
7106
|
+
}
|
|
7086
7107
|
}
|
|
7087
7108
|
|
|
7088
7109
|
// init new transcription with sot, language (opt) and task tokens
|
|
@@ -7564,14 +7585,17 @@ int whisper_full_with_state(
|
|
|
7564
7585
|
|
|
7565
7586
|
//WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
|
|
7566
7587
|
|
|
7567
|
-
// update
|
|
7568
|
-
|
|
7569
|
-
if (prompt.front() == whisper_token_prev(ctx)) {
|
|
7570
|
-
|
|
7588
|
+
// update prompt_past1
|
|
7589
|
+
prompt_past1.clear();
|
|
7590
|
+
if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
|
|
7591
|
+
prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
|
|
7571
7592
|
}
|
|
7572
7593
|
|
|
7573
|
-
|
|
7574
|
-
|
|
7594
|
+
// Add newly decoded tokens to the rolling context
|
|
7595
|
+
if (!is_no_speech) {
|
|
7596
|
+
for (int i = 0; i < result_len; ++i) {
|
|
7597
|
+
prompt_past1.push_back(tokens_cur[i].id);
|
|
7598
|
+
}
|
|
7575
7599
|
}
|
|
7576
7600
|
|
|
7577
7601
|
if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {
|
|
@@ -8943,7 +8967,7 @@ void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
|
|
|
8943
8967
|
}
|
|
8944
8968
|
|
|
8945
8969
|
const char * whisper_version(void) {
|
|
8946
|
-
return "1.
|
|
8970
|
+
return "1.8.0";
|
|
8947
8971
|
}
|
|
8948
8972
|
|
|
8949
8973
|
WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
|
package/cpp/whisper.h
CHANGED
|
@@ -526,6 +526,7 @@ extern "C" {
|
|
|
526
526
|
// use whisper_tokenize() to convert text to tokens
|
|
527
527
|
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
|
|
528
528
|
const char * initial_prompt;
|
|
529
|
+
bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
|
|
529
530
|
const whisper_token * prompt_tokens;
|
|
530
531
|
int prompt_n_tokens;
|
|
531
532
|
|
package/ios/CMakeLists.txt
CHANGED
|
@@ -55,7 +55,12 @@ add_library(rnwhisper SHARED
|
|
|
55
55
|
${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
|
|
56
56
|
${SOURCE_DIR}/ggml-cpu/vec.cpp
|
|
57
57
|
${SOURCE_DIR}/ggml-cpu/ops.cpp
|
|
58
|
-
${SOURCE_DIR}/ggml-metal.
|
|
58
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal.cpp
|
|
59
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal-common.cpp
|
|
60
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal-device.cpp
|
|
61
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal-context.m
|
|
62
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal-device.m
|
|
63
|
+
${SOURCE_DIR}/ggml-metal/ggml-metal-ops.cpp
|
|
59
64
|
${SOURCE_DIR}/ggml-opt.cpp
|
|
60
65
|
${SOURCE_DIR}/ggml-threading.cpp
|
|
61
66
|
${SOURCE_DIR}/ggml-quants.c
|
package/ios/RNWhisper.mm
CHANGED
|
@@ -357,10 +357,9 @@ RCT_REMAP_METHOD(releaseContext,
|
|
|
357
357
|
reject(@"whisper_error", @"Context not found", nil);
|
|
358
358
|
return;
|
|
359
359
|
}
|
|
360
|
-
[context invalidate];
|
|
361
|
-
[contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
|
|
362
|
-
// Also remove from unified context management
|
|
363
360
|
rnwhisper_jsi::removeContext(contextId);
|
|
361
|
+
[contexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
|
|
362
|
+
[context invalidate];
|
|
364
363
|
resolve(nil);
|
|
365
364
|
}
|
|
366
365
|
|
|
@@ -555,10 +554,9 @@ RCT_REMAP_METHOD(releaseVadContext,
|
|
|
555
554
|
reject(@"whisper_vad_error", @"VAD context not found", nil);
|
|
556
555
|
return;
|
|
557
556
|
}
|
|
558
|
-
[vadContext invalidate];
|
|
559
|
-
[vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
|
|
560
|
-
// Also remove from unified context management
|
|
561
557
|
rnwhisper_jsi::removeVadContext(contextId);
|
|
558
|
+
[vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
|
|
559
|
+
[vadContext invalidate];
|
|
562
560
|
resolve(nil);
|
|
563
561
|
}
|
|
564
562
|
|
|
@@ -574,6 +572,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
|
|
|
574
572
|
if (contexts != nil) {
|
|
575
573
|
for (NSNumber *contextId in contexts) {
|
|
576
574
|
RNWhisperContext *context = contexts[contextId];
|
|
575
|
+
rnwhisper_jsi::removeContext([contextId intValue]);
|
|
577
576
|
[context invalidate];
|
|
578
577
|
}
|
|
579
578
|
[contexts removeAllObjects];
|
|
@@ -585,6 +584,7 @@ RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
|
|
|
585
584
|
if (vadContexts != nil) {
|
|
586
585
|
for (NSNumber *contextId in vadContexts) {
|
|
587
586
|
RNWhisperVadContext *vadContext = vadContexts[contextId];
|
|
587
|
+
rnwhisper_jsi::removeVadContext([contextId intValue]);
|
|
588
588
|
[vadContext invalidate];
|
|
589
589
|
}
|
|
590
590
|
[vadContexts removeAllObjects];
|
package/ios/RNWhisperContext.mm
CHANGED
|
@@ -20,25 +20,28 @@
|
|
|
20
20
|
|
|
21
21
|
#ifdef WSP_GGML_USE_METAL
|
|
22
22
|
if (ctx_params.use_gpu) {
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
//
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
}
|
|
23
|
+
// TODO: GPU VAD is forced disabled until the performance is improved (ref: whisper.cpp/whisper_vad_init_context)
|
|
24
|
+
ctx_params.use_gpu = false;
|
|
25
|
+
// ctx_params.gpu_device = 0;
|
|
26
|
+
|
|
27
|
+
// id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
|
28
|
+
|
|
29
|
+
// // Check ggml-metal availability
|
|
30
|
+
// BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
|
|
31
|
+
// if (@available(iOS 16.0, tvOS 16.0, *)) {
|
|
32
|
+
// supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
|
|
33
|
+
// }
|
|
34
|
+
// if (!supportsGgmlMetal) {
|
|
35
|
+
// ctx_params.use_gpu = false;
|
|
36
|
+
// reasonNoMetal = @"Metal is not supported in this device";
|
|
37
|
+
// }
|
|
38
|
+
// device = nil;
|
|
34
39
|
|
|
35
40
|
#if TARGET_OS_SIMULATOR
|
|
36
41
|
// Use the backend, but no layers because not supported fully on simulator
|
|
37
42
|
ctx_params.use_gpu = false;
|
|
38
43
|
reasonNoMetal = @"Metal is not supported in simulator";
|
|
39
44
|
#endif
|
|
40
|
-
|
|
41
|
-
device = nil;
|
|
42
45
|
}
|
|
43
46
|
#endif // WSP_GGML_USE_METAL
|
|
44
47
|
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
extern "C" {
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
|
-
#define WSP_GGML_BACKEND_API_VERSION
|
|
11
|
+
#define WSP_GGML_BACKEND_API_VERSION 2
|
|
12
12
|
|
|
13
13
|
//
|
|
14
14
|
// Backend buffer type
|
|
@@ -114,6 +114,9 @@ extern "C" {
|
|
|
114
114
|
void (*event_record)(wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
|
|
115
115
|
// wait for an event on a different stream
|
|
116
116
|
void (*event_wait) (wsp_ggml_backend_t backend, wsp_ggml_backend_event_t event);
|
|
117
|
+
|
|
118
|
+
// (optional) sort/optimize the nodes in the graph
|
|
119
|
+
void (*graph_optimize) (wsp_ggml_backend_t backend, struct wsp_ggml_cgraph * cgraph);
|
|
117
120
|
};
|
|
118
121
|
|
|
119
122
|
struct wsp_ggml_backend {
|
|
@@ -132,6 +132,8 @@ extern "C" {
|
|
|
132
132
|
WSP_GGML_BACKEND_DEVICE_TYPE_CPU,
|
|
133
133
|
// GPU device using dedicated memory
|
|
134
134
|
WSP_GGML_BACKEND_DEVICE_TYPE_GPU,
|
|
135
|
+
// integrated GPU device using host memory
|
|
136
|
+
WSP_GGML_BACKEND_DEVICE_TYPE_IGPU,
|
|
135
137
|
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
|
136
138
|
WSP_GGML_BACKEND_DEVICE_TYPE_ACCEL
|
|
137
139
|
};
|
|
@@ -150,11 +152,21 @@ extern "C" {
|
|
|
150
152
|
|
|
151
153
|
// all the device properties
|
|
152
154
|
struct wsp_ggml_backend_dev_props {
|
|
155
|
+
// device name
|
|
153
156
|
const char * name;
|
|
157
|
+
// device description
|
|
154
158
|
const char * description;
|
|
159
|
+
// device free memory in bytes
|
|
155
160
|
size_t memory_free;
|
|
161
|
+
// device total memory in bytes
|
|
156
162
|
size_t memory_total;
|
|
163
|
+
// device type
|
|
157
164
|
enum wsp_ggml_backend_dev_type type;
|
|
165
|
+
// device id
|
|
166
|
+
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
|
|
167
|
+
// if the id is unknown, this should be NULL
|
|
168
|
+
const char * device_id;
|
|
169
|
+
// device capabilities
|
|
158
170
|
struct wsp_ggml_backend_dev_caps caps;
|
|
159
171
|
};
|
|
160
172
|
|
|
@@ -302,11 +314,15 @@ extern "C" {
|
|
|
302
314
|
WSP_GGML_API int wsp_ggml_backend_sched_get_n_splits(wsp_ggml_backend_sched_t sched);
|
|
303
315
|
WSP_GGML_API int wsp_ggml_backend_sched_get_n_copies(wsp_ggml_backend_sched_t sched);
|
|
304
316
|
|
|
305
|
-
WSP_GGML_API
|
|
317
|
+
WSP_GGML_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_sched_get_buffer_type(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
|
|
318
|
+
WSP_GGML_API size_t wsp_ggml_backend_sched_get_buffer_size(wsp_ggml_backend_sched_t sched, wsp_ggml_backend_t backend);
|
|
306
319
|
|
|
307
320
|
WSP_GGML_API void wsp_ggml_backend_sched_set_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node, wsp_ggml_backend_t backend);
|
|
308
321
|
WSP_GGML_API wsp_ggml_backend_t wsp_ggml_backend_sched_get_tensor_backend(wsp_ggml_backend_sched_t sched, struct wsp_ggml_tensor * node);
|
|
309
322
|
|
|
323
|
+
// Split graph without allocating it
|
|
324
|
+
WSP_GGML_API void wsp_ggml_backend_sched_split_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
|
|
325
|
+
|
|
310
326
|
// Allocate and compute graph on the backend scheduler
|
|
311
327
|
WSP_GGML_API bool wsp_ggml_backend_sched_alloc_graph(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph); // returns success
|
|
312
328
|
WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_sched_graph_compute(wsp_ggml_backend_sched_t sched, struct wsp_ggml_cgraph * graph);
|
|
@@ -99,6 +99,9 @@ typedef sycl::half2 wsp_ggml_half2;
|
|
|
99
99
|
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
|
100
100
|
#define QR4_1 2
|
|
101
101
|
|
|
102
|
+
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
|
|
103
|
+
#define QR_MXFP4 2
|
|
104
|
+
|
|
102
105
|
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
|
103
106
|
#define QR5_0 2
|
|
104
107
|
|
|
@@ -184,6 +187,13 @@ typedef struct {
|
|
|
184
187
|
} block_q4_1;
|
|
185
188
|
static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
|
186
189
|
|
|
190
|
+
#define QK_MXFP4 32
|
|
191
|
+
typedef struct {
|
|
192
|
+
uint8_t e; // E8M0
|
|
193
|
+
uint8_t qs[QK_MXFP4/2];
|
|
194
|
+
} block_mxfp4;
|
|
195
|
+
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
|
|
196
|
+
|
|
187
197
|
#define QK5_0 32
|
|
188
198
|
typedef struct {
|
|
189
199
|
wsp_ggml_half d; // delta
|
|
@@ -1074,10 +1084,17 @@ WSP_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
|
|
|
1074
1084
|
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
|
1075
1085
|
WSP_GGML_TABLE_END()
|
|
1076
1086
|
|
|
1087
|
+
// TODO: fix name to kvalues_iq4_nl
|
|
1077
1088
|
WSP_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
|
|
1078
1089
|
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
|
|
1079
1090
|
WSP_GGML_TABLE_END()
|
|
1080
1091
|
|
|
1092
|
+
// e2m1 values (doubled)
|
|
1093
|
+
// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
|
|
1094
|
+
WSP_GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
|
|
1095
|
+
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
|
|
1096
|
+
WSP_GGML_TABLE_END()
|
|
1097
|
+
|
|
1081
1098
|
#define NGRID_IQ1S 2048
|
|
1082
1099
|
#define IQ1S_DELTA 0.125f
|
|
1083
1100
|
#define IQ1M_DELTA 0.125f
|
|
@@ -101,7 +101,6 @@ extern "C" {
|
|
|
101
101
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v (void);
|
|
102
102
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx (void);
|
|
103
103
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe (void);
|
|
104
|
-
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_nnpa (void);
|
|
105
104
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd (void);
|
|
106
105
|
WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile (void);
|
|
107
106
|
|
|
@@ -135,6 +134,7 @@ extern "C" {
|
|
|
135
134
|
WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
|
|
136
135
|
|
|
137
136
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
137
|
+
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
|
|
138
138
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_fp16(const float *, wsp_ggml_fp16_t *, int64_t);
|
|
139
139
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp16_to_fp32(const wsp_ggml_fp16_t *, float *, int64_t);
|
|
140
140
|
WSP_GGML_BACKEND_API void wsp_ggml_cpu_fp32_to_bf16(const float *, wsp_ggml_bf16_t *, int64_t);
|