whisper.rn 0.4.0-rc.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/android/build.gradle +12 -3
- package/android/src/main/CMakeLists.txt +44 -13
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -38
- package/android/src/main/jni.cpp +38 -1
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/coreml/whisper-compat.h +10 -0
- package/cpp/coreml/whisper-compat.m +35 -0
- package/cpp/coreml/whisper-decoder-impl.h +27 -15
- package/cpp/coreml/whisper-decoder-impl.m +36 -10
- package/cpp/coreml/whisper-encoder-impl.h +21 -9
- package/cpp/coreml/whisper-encoder-impl.m +29 -3
- package/cpp/ggml-alloc.c +727 -517
- package/cpp/ggml-alloc.h +47 -65
- package/cpp/ggml-backend-impl.h +196 -57
- package/cpp/ggml-backend-reg.cpp +591 -0
- package/cpp/ggml-backend.cpp +2016 -0
- package/cpp/ggml-backend.h +234 -89
- package/cpp/ggml-common.h +1861 -0
- package/cpp/ggml-cpp.h +39 -0
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/binary-ops.cpp +158 -0
- package/cpp/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml-cpu/common.h +72 -0
- package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
- package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
- package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
- package/cpp/ggml-cpu/ops.cpp +9085 -0
- package/cpp/ggml-cpu/ops.h +111 -0
- package/cpp/ggml-cpu/quants.c +1157 -0
- package/cpp/ggml-cpu/quants.h +89 -0
- package/cpp/ggml-cpu/repack.cpp +1570 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +1006 -0
- package/cpp/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml-cpu/traits.h +38 -0
- package/cpp/ggml-cpu/unary-ops.cpp +186 -0
- package/cpp/ggml-cpu/unary-ops.h +28 -0
- package/cpp/ggml-cpu/vec.cpp +321 -0
- package/cpp/ggml-cpu/vec.h +973 -0
- package/cpp/ggml-cpu.h +143 -0
- package/cpp/ggml-impl.h +525 -168
- package/cpp/ggml-metal-impl.h +622 -0
- package/cpp/ggml-metal.h +16 -14
- package/cpp/ggml-metal.m +5289 -1859
- package/cpp/ggml-opt.cpp +1037 -0
- package/cpp/ggml-opt.h +237 -0
- package/cpp/ggml-quants.c +2916 -6877
- package/cpp/ggml-quants.h +87 -249
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +14 -0
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +3293 -16770
- package/cpp/ggml.h +778 -835
- package/cpp/gguf.cpp +1347 -0
- package/cpp/gguf.h +202 -0
- package/cpp/rn-whisper.cpp +84 -0
- package/cpp/rn-whisper.h +2 -0
- package/cpp/whisper-arch.h +197 -0
- package/cpp/whisper.cpp +3240 -944
- package/cpp/whisper.h +144 -31
- package/ios/CMakeLists.txt +95 -0
- package/ios/RNWhisper.h +5 -0
- package/ios/RNWhisper.mm +124 -37
- package/ios/RNWhisperAudioUtils.h +1 -0
- package/ios/RNWhisperAudioUtils.m +24 -13
- package/ios/RNWhisperContext.h +8 -2
- package/ios/RNWhisperContext.mm +42 -8
- package/ios/rnwhisper.xcframework/Info.plist +74 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/jest/mock.js +14 -1
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +48 -19
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +48 -19
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +6 -3
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +25 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +15 -10
- package/src/NativeRNWhisper.ts +12 -3
- package/src/index.ts +63 -24
- package/src/version.json +1 -1
- package/whisper-rn.podspec +18 -18
- package/cpp/README.md +0 -4
- package/cpp/ggml-backend.c +0 -1718
- package/cpp/ggml-metal-whisper.metal +0 -5820
package/cpp/whisper.h
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#define WHISPER_H
|
|
3
3
|
|
|
4
4
|
#include "ggml.h"
|
|
5
|
+
#include "ggml-cpu.h"
|
|
5
6
|
|
|
6
7
|
#include <stddef.h>
|
|
7
8
|
#include <stdint.h>
|
|
@@ -84,9 +85,48 @@ extern "C" {
|
|
|
84
85
|
typedef int32_t whisper_token;
|
|
85
86
|
typedef int32_t whisper_seq_id;
|
|
86
87
|
|
|
88
|
+
enum whisper_alignment_heads_preset {
|
|
89
|
+
WHISPER_AHEADS_NONE,
|
|
90
|
+
WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
|
|
91
|
+
WHISPER_AHEADS_CUSTOM,
|
|
92
|
+
WHISPER_AHEADS_TINY_EN,
|
|
93
|
+
WHISPER_AHEADS_TINY,
|
|
94
|
+
WHISPER_AHEADS_BASE_EN,
|
|
95
|
+
WHISPER_AHEADS_BASE,
|
|
96
|
+
WHISPER_AHEADS_SMALL_EN,
|
|
97
|
+
WHISPER_AHEADS_SMALL,
|
|
98
|
+
WHISPER_AHEADS_MEDIUM_EN,
|
|
99
|
+
WHISPER_AHEADS_MEDIUM,
|
|
100
|
+
WHISPER_AHEADS_LARGE_V1,
|
|
101
|
+
WHISPER_AHEADS_LARGE_V2,
|
|
102
|
+
WHISPER_AHEADS_LARGE_V3,
|
|
103
|
+
WHISPER_AHEADS_LARGE_V3_TURBO,
|
|
104
|
+
};
|
|
105
|
+
|
|
106
|
+
typedef struct whisper_ahead {
|
|
107
|
+
int n_text_layer;
|
|
108
|
+
int n_head;
|
|
109
|
+
} whisper_ahead;
|
|
110
|
+
|
|
111
|
+
typedef struct whisper_aheads {
|
|
112
|
+
size_t n_heads;
|
|
113
|
+
const whisper_ahead * heads;
|
|
114
|
+
} whisper_aheads;
|
|
115
|
+
|
|
87
116
|
struct whisper_context_params {
|
|
88
117
|
bool use_gpu;
|
|
89
118
|
bool use_coreml;
|
|
119
|
+
bool flash_attn;
|
|
120
|
+
int gpu_device; // CUDA device
|
|
121
|
+
|
|
122
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
|
123
|
+
bool dtw_token_timestamps;
|
|
124
|
+
enum whisper_alignment_heads_preset dtw_aheads_preset;
|
|
125
|
+
|
|
126
|
+
int dtw_n_top;
|
|
127
|
+
struct whisper_aheads dtw_aheads;
|
|
128
|
+
|
|
129
|
+
size_t dtw_mem_size; // TODO: remove
|
|
90
130
|
};
|
|
91
131
|
|
|
92
132
|
typedef struct whisper_token_data {
|
|
@@ -103,6 +143,11 @@ extern "C" {
|
|
|
103
143
|
int64_t t0; // start time of the token
|
|
104
144
|
int64_t t1; // end time of the token
|
|
105
145
|
|
|
146
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
|
147
|
+
// do not use if you haven't computed token-level timestamps with dtw
|
|
148
|
+
// Roughly corresponds to the moment in audio in which the token was output
|
|
149
|
+
int64_t t_dtw;
|
|
150
|
+
|
|
106
151
|
float vlen; // voice length of the token
|
|
107
152
|
} whisper_token_data;
|
|
108
153
|
|
|
@@ -145,6 +190,15 @@ extern "C" {
|
|
|
145
190
|
uint32_t value; // Unicode code point or rule ID
|
|
146
191
|
} whisper_grammar_element;
|
|
147
192
|
|
|
193
|
+
typedef struct whisper_vad_params {
|
|
194
|
+
float threshold; // Probability threshold to consider as speech.
|
|
195
|
+
int min_speech_duration_ms; // Min duration for a valid speech segment.
|
|
196
|
+
int min_silence_duration_ms; // Min silence duration to consider speech as ended.
|
|
197
|
+
float max_speech_duration_s; // Max duration of a speech segment before forcing a new segment.
|
|
198
|
+
int speech_pad_ms; // Padding added before and after speech segments.
|
|
199
|
+
float samples_overlap; // Overlap in seconds when copying audio samples from speech segment.
|
|
200
|
+
} whisper_vad_params;
|
|
201
|
+
|
|
148
202
|
// Various functions for loading a ggml whisper model.
|
|
149
203
|
// Allocate (almost) all memory needed for the model.
|
|
150
204
|
// Return NULL on failure
|
|
@@ -196,6 +250,13 @@ extern "C" {
|
|
|
196
250
|
// GPU, by caching compiled 'blobs' there.
|
|
197
251
|
// Set to nullptr if not used.
|
|
198
252
|
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
|
|
253
|
+
WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
|
|
254
|
+
struct whisper_context * ctx,
|
|
255
|
+
struct whisper_state * state,
|
|
256
|
+
const char * model_path,
|
|
257
|
+
const char * device,
|
|
258
|
+
const char * cache_dir);
|
|
259
|
+
|
|
199
260
|
WHISPER_API int whisper_ctx_init_openvino_encoder(
|
|
200
261
|
struct whisper_context * ctx,
|
|
201
262
|
const char * model_path,
|
|
@@ -224,22 +285,6 @@ extern "C" {
|
|
|
224
285
|
int n_samples,
|
|
225
286
|
int n_threads);
|
|
226
287
|
|
|
227
|
-
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
|
|
228
|
-
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
|
229
|
-
// Returns 0 on success
|
|
230
|
-
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
|
231
|
-
struct whisper_context * ctx,
|
|
232
|
-
const float * samples,
|
|
233
|
-
int n_samples,
|
|
234
|
-
int n_threads);
|
|
235
|
-
|
|
236
|
-
WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
|
|
237
|
-
struct whisper_context * ctx,
|
|
238
|
-
struct whisper_state * state,
|
|
239
|
-
const float * samples,
|
|
240
|
-
int n_samples,
|
|
241
|
-
int n_threads);
|
|
242
|
-
|
|
243
288
|
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
|
244
289
|
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
|
245
290
|
// n_mel must be 80
|
|
@@ -296,7 +341,7 @@ extern "C" {
|
|
|
296
341
|
// Convert the provided text into tokens.
|
|
297
342
|
// The tokens pointer must be large enough to hold the resulting tokens.
|
|
298
343
|
// Returns the number of tokens on success, no more than n_max_tokens
|
|
299
|
-
// Returns
|
|
344
|
+
// Returns a negative number on failure - the number of tokens that would have been returned
|
|
300
345
|
// TODO: not sure if correct
|
|
301
346
|
WHISPER_API int whisper_tokenize(
|
|
302
347
|
struct whisper_context * ctx,
|
|
@@ -304,8 +349,12 @@ extern "C" {
|
|
|
304
349
|
whisper_token * tokens,
|
|
305
350
|
int n_max_tokens);
|
|
306
351
|
|
|
352
|
+
// Return the number of tokens in the provided text
|
|
353
|
+
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
|
|
354
|
+
int whisper_token_count(struct whisper_context * ctx, const char * text);
|
|
355
|
+
|
|
307
356
|
// Largest language id (i.e. number of available languages - 1)
|
|
308
|
-
WHISPER_API int whisper_lang_max_id();
|
|
357
|
+
WHISPER_API int whisper_lang_max_id(void);
|
|
309
358
|
|
|
310
359
|
// Return the id of the specified language, returns -1 if not found
|
|
311
360
|
// Examples:
|
|
@@ -385,6 +434,14 @@ extern "C" {
|
|
|
385
434
|
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
|
386
435
|
|
|
387
436
|
// Performance information from the default state.
|
|
437
|
+
struct whisper_timings {
|
|
438
|
+
float sample_ms;
|
|
439
|
+
float encode_ms;
|
|
440
|
+
float decode_ms;
|
|
441
|
+
float batchd_ms;
|
|
442
|
+
float prompt_ms;
|
|
443
|
+
};
|
|
444
|
+
WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
|
|
388
445
|
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
|
389
446
|
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
|
390
447
|
|
|
@@ -412,11 +469,6 @@ extern "C" {
|
|
|
412
469
|
// If it returns false, the computation is aborted
|
|
413
470
|
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
|
|
414
471
|
|
|
415
|
-
// Abort callback
|
|
416
|
-
// If not NULL, called before ggml computation
|
|
417
|
-
// If it returns true, the computation is aborted
|
|
418
|
-
typedef bool (*whisper_abort_callback)(void * user_data);
|
|
419
|
-
|
|
420
472
|
// Logits filter callback
|
|
421
473
|
// Can be used to modify the logits before sampling
|
|
422
474
|
// If not NULL, called after applying temperature to logits
|
|
@@ -458,15 +510,19 @@ extern "C" {
|
|
|
458
510
|
|
|
459
511
|
// [EXPERIMENTAL] speed-up techniques
|
|
460
512
|
// note: these can significantly reduce the quality of the output
|
|
461
|
-
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
|
462
513
|
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
|
463
514
|
int audio_ctx; // overwrite the audio context size (0 = use default)
|
|
464
515
|
|
|
465
516
|
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
|
466
517
|
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
|
467
518
|
|
|
519
|
+
// A regular expression that matches tokens to suppress
|
|
520
|
+
const char * suppress_regex;
|
|
521
|
+
|
|
468
522
|
// tokens to provide to the whisper decoder as initial prompt
|
|
469
523
|
// these are prepended to any existing text context from a previous call
|
|
524
|
+
// use whisper_tokenize() to convert text to tokens
|
|
525
|
+
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
|
|
470
526
|
const char * initial_prompt;
|
|
471
527
|
const whisper_token * prompt_tokens;
|
|
472
528
|
int prompt_n_tokens;
|
|
@@ -476,8 +532,8 @@ extern "C" {
|
|
|
476
532
|
bool detect_language;
|
|
477
533
|
|
|
478
534
|
// common decoding parameters:
|
|
479
|
-
bool suppress_blank;
|
|
480
|
-
bool
|
|
535
|
+
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
|
536
|
+
bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
|
481
537
|
|
|
482
538
|
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
|
|
483
539
|
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
|
@@ -488,7 +544,7 @@ extern "C" {
|
|
|
488
544
|
float temperature_inc;
|
|
489
545
|
float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
|
|
490
546
|
float logprob_thold;
|
|
491
|
-
float no_speech_thold;
|
|
547
|
+
float no_speech_thold;
|
|
492
548
|
|
|
493
549
|
struct {
|
|
494
550
|
int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
|
|
@@ -513,7 +569,7 @@ extern "C" {
|
|
|
513
569
|
void * encoder_begin_callback_user_data;
|
|
514
570
|
|
|
515
571
|
// called each time before ggml computation starts
|
|
516
|
-
|
|
572
|
+
wsp_ggml_abort_callback abort_callback;
|
|
517
573
|
void * abort_callback_user_data;
|
|
518
574
|
|
|
519
575
|
// called by each decoder to filter obtained logits
|
|
@@ -524,13 +580,20 @@ extern "C" {
|
|
|
524
580
|
size_t n_grammar_rules;
|
|
525
581
|
size_t i_start_rule;
|
|
526
582
|
float grammar_penalty;
|
|
583
|
+
|
|
584
|
+
// Voice Activity Detection (VAD) params
|
|
585
|
+
bool vad; // Enable VAD
|
|
586
|
+
const char * vad_model_path; // Path to VAD model
|
|
587
|
+
|
|
588
|
+
whisper_vad_params vad_params;
|
|
527
589
|
};
|
|
528
590
|
|
|
529
591
|
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
|
|
530
|
-
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
|
|
531
|
-
WHISPER_API struct whisper_context_params
|
|
592
|
+
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
|
|
593
|
+
WHISPER_API struct whisper_context_params whisper_context_default_params (void);
|
|
594
|
+
|
|
532
595
|
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
|
|
533
|
-
WHISPER_API struct whisper_full_params
|
|
596
|
+
WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy);
|
|
534
597
|
|
|
535
598
|
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
|
536
599
|
// Not thread safe for same context
|
|
@@ -606,6 +669,53 @@ extern "C" {
|
|
|
606
669
|
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
|
|
607
670
|
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
|
|
608
671
|
|
|
672
|
+
//
|
|
673
|
+
// Voice Activity Detection (VAD)
|
|
674
|
+
//
|
|
675
|
+
|
|
676
|
+
struct whisper_vad_context;
|
|
677
|
+
|
|
678
|
+
WHISPER_API struct whisper_vad_params whisper_vad_default_params(void);
|
|
679
|
+
|
|
680
|
+
struct whisper_vad_context_params {
|
|
681
|
+
int n_threads; // The number of threads to use for processing.
|
|
682
|
+
bool use_gpu;
|
|
683
|
+
int gpu_device; // CUDA device
|
|
684
|
+
};
|
|
685
|
+
|
|
686
|
+
WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
|
|
687
|
+
|
|
688
|
+
WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(const char * path_model, struct whisper_vad_context_params params);
|
|
689
|
+
WHISPER_API struct whisper_vad_context * whisper_vad_init_with_params (struct whisper_model_loader * loader, struct whisper_vad_context_params params);
|
|
690
|
+
|
|
691
|
+
WHISPER_API bool whisper_vad_detect_speech(
|
|
692
|
+
struct whisper_vad_context * vctx,
|
|
693
|
+
const float * samples,
|
|
694
|
+
int n_samples);
|
|
695
|
+
|
|
696
|
+
WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx);
|
|
697
|
+
WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx);
|
|
698
|
+
|
|
699
|
+
struct whisper_vad_segments;
|
|
700
|
+
|
|
701
|
+
WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_probs(
|
|
702
|
+
struct whisper_vad_context * vctx,
|
|
703
|
+
struct whisper_vad_params params);
|
|
704
|
+
|
|
705
|
+
WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_samples(
|
|
706
|
+
struct whisper_vad_context * vctx,
|
|
707
|
+
struct whisper_vad_params params,
|
|
708
|
+
const float * samples,
|
|
709
|
+
int n_samples);
|
|
710
|
+
|
|
711
|
+
WHISPER_API int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments);
|
|
712
|
+
|
|
713
|
+
WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment);
|
|
714
|
+
WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment);
|
|
715
|
+
|
|
716
|
+
WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments);
|
|
717
|
+
WHISPER_API void whisper_vad_free (struct whisper_vad_context * ctx);
|
|
718
|
+
|
|
609
719
|
////////////////////////////////////////////////////////////////////////////
|
|
610
720
|
|
|
611
721
|
// Temporary helpers needed for exposing ggml interface
|
|
@@ -619,6 +729,9 @@ extern "C" {
|
|
|
619
729
|
|
|
620
730
|
WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
|
|
621
731
|
|
|
732
|
+
// Get the no_speech probability for the specified segment
|
|
733
|
+
WHISPER_API float whisper_full_get_segment_no_speech_prob (struct whisper_context * ctx, int i_segment);
|
|
734
|
+
WHISPER_API float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment);
|
|
622
735
|
#ifdef __cplusplus
|
|
623
736
|
}
|
|
624
737
|
#endif
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.16)
|
|
2
|
+
project(rnwhisper VERSION 1.0.0 LANGUAGES CXX C)
|
|
3
|
+
|
|
4
|
+
set(CMAKE_CXX_STANDARD 17)
|
|
5
|
+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
6
|
+
|
|
7
|
+
# iOS specific settings
|
|
8
|
+
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0)
|
|
9
|
+
set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO)
|
|
10
|
+
|
|
11
|
+
# Dependencies and compile options
|
|
12
|
+
add_definitions(
|
|
13
|
+
-DNDEBUG
|
|
14
|
+
-DO3
|
|
15
|
+
-DWSP_GGML_USE_CPU
|
|
16
|
+
-DWSP_GGML_USE_ACCELERATE
|
|
17
|
+
-DWSP_GGML_USE_METAL
|
|
18
|
+
-DWSP_GGML_METAL_USE_BF16
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64;x86_64")
|
|
22
|
+
add_definitions(-DWSP_GGML_CPU_GENERIC)
|
|
23
|
+
endif ()
|
|
24
|
+
|
|
25
|
+
set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp)
|
|
26
|
+
|
|
27
|
+
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
|
|
28
|
+
set(SOURCE_FILES_ARCH
|
|
29
|
+
${SOURCE_DIR}/ggml-cpu/arch/arm/quants.c
|
|
30
|
+
${SOURCE_DIR}/ggml-cpu/arch/arm/repack.cpp
|
|
31
|
+
)
|
|
32
|
+
endif ()
|
|
33
|
+
|
|
34
|
+
# Define public headers
|
|
35
|
+
set(PUBLIC_HEADERS
|
|
36
|
+
${SOURCE_DIR}/rn-whisper.h
|
|
37
|
+
${SOURCE_DIR}/whisper.h
|
|
38
|
+
${SOURCE_DIR}/ggml.h
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Create library target
|
|
42
|
+
add_library(rnwhisper SHARED
|
|
43
|
+
${SOURCE_DIR}/ggml.c
|
|
44
|
+
${SOURCE_DIR}/ggml-alloc.c
|
|
45
|
+
${SOURCE_DIR}/ggml-backend.cpp
|
|
46
|
+
${SOURCE_DIR}/ggml-backend-reg.cpp
|
|
47
|
+
${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
|
|
48
|
+
${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
|
|
49
|
+
${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
|
|
50
|
+
${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
|
|
51
|
+
${SOURCE_DIR}/ggml-cpu/quants.c
|
|
52
|
+
${SOURCE_DIR}/ggml-cpu/traits.cpp
|
|
53
|
+
${SOURCE_DIR}/ggml-cpu/repack.cpp
|
|
54
|
+
${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
|
|
55
|
+
${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
|
|
56
|
+
${SOURCE_DIR}/ggml-cpu/vec.cpp
|
|
57
|
+
${SOURCE_DIR}/ggml-cpu/ops.cpp
|
|
58
|
+
${SOURCE_DIR}/ggml-metal.m
|
|
59
|
+
${SOURCE_DIR}/ggml-opt.cpp
|
|
60
|
+
${SOURCE_DIR}/ggml-threading.cpp
|
|
61
|
+
${SOURCE_DIR}/ggml-quants.c
|
|
62
|
+
${SOURCE_DIR}/gguf.cpp
|
|
63
|
+
${SOURCE_DIR}/whisper.cpp
|
|
64
|
+
${SOURCE_DIR}/rn-whisper.cpp
|
|
65
|
+
${SOURCE_DIR}/rn-audioutils.cpp
|
|
66
|
+
${SOURCE_FILES_ARCH}
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Setup include directories
|
|
70
|
+
target_include_directories(rnwhisper
|
|
71
|
+
PUBLIC
|
|
72
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
|
|
73
|
+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
|
|
74
|
+
$<INSTALL_INTERFACE:include>
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Link required frameworks
|
|
78
|
+
target_link_libraries(rnwhisper PRIVATE
|
|
79
|
+
"-framework Accelerate"
|
|
80
|
+
"-framework Foundation"
|
|
81
|
+
"-framework Metal"
|
|
82
|
+
"-framework MetalKit"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Set properties for framework
|
|
86
|
+
set_target_properties(rnwhisper PROPERTIES
|
|
87
|
+
MACOSX_FRAMEWORK_IDENTIFIER "com.rnwhisper"
|
|
88
|
+
MACOSX_FRAMEWORK_BUNDLE_VERSION 1.0.0
|
|
89
|
+
MACOSX_FRAMEWORK_SHORT_VERSION_STRING 1.0.0
|
|
90
|
+
FRAMEWORK TRUE
|
|
91
|
+
FRAMEWORK_VERSION 1.0.0
|
|
92
|
+
VERSION 1.0.0
|
|
93
|
+
PUBLIC_HEADER "${PUBLIC_HEADERS}"
|
|
94
|
+
XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO
|
|
95
|
+
)
|
package/ios/RNWhisper.h
CHANGED
package/ios/RNWhisper.mm
CHANGED
|
@@ -50,6 +50,7 @@ RCT_REMAP_METHOD(initContext,
|
|
|
50
50
|
BOOL isBundleAsset = [[modelOptions objectForKey:@"isBundleAsset"] boolValue];
|
|
51
51
|
BOOL useGpu = [[modelOptions objectForKey:@"useGpu"] boolValue];
|
|
52
52
|
BOOL useCoreMLIos = [[modelOptions objectForKey:@"useCoreMLIos"] boolValue];
|
|
53
|
+
BOOL useFlashAttn = [[modelOptions objectForKey:@"useFlashAttn"] boolValue];
|
|
53
54
|
|
|
54
55
|
// For support debug assets in development mode
|
|
55
56
|
BOOL downloadCoreMLAssets = [[modelOptions objectForKey:@"downloadCoreMLAssets"] boolValue];
|
|
@@ -79,6 +80,7 @@ RCT_REMAP_METHOD(initContext,
|
|
|
79
80
|
contextId:contextId
|
|
80
81
|
noCoreML:!useCoreMLIos
|
|
81
82
|
noMetal:!useGpu
|
|
83
|
+
useFlashAttn:useFlashAttn
|
|
82
84
|
];
|
|
83
85
|
if ([context getContext] == NULL) {
|
|
84
86
|
reject(@"whisper_cpp_error", @"Failed to load the model", nil);
|
|
@@ -103,42 +105,17 @@ RCT_REMAP_METHOD(initContext,
|
|
|
103
105
|
];
|
|
104
106
|
}
|
|
105
107
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
108
|
+
- (void)transcribeData:(RNWhisperContext *)context
|
|
109
|
+
withContextId:(int)contextId
|
|
110
|
+
withJobId:(int)jobId
|
|
111
|
+
withData:(float *)data
|
|
112
|
+
withDataCount:(int)count
|
|
113
|
+
withOptions:(NSDictionary *)options
|
|
114
|
+
withResolver:(RCTPromiseResolveBlock)resolve
|
|
115
|
+
withRejecter:(RCTPromiseRejectBlock)reject
|
|
113
116
|
{
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
if (context == nil) {
|
|
117
|
-
reject(@"whisper_error", @"Context not found", nil);
|
|
118
|
-
return;
|
|
119
|
-
}
|
|
120
|
-
if ([context isCapturing]) {
|
|
121
|
-
reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
|
|
122
|
-
return;
|
|
123
|
-
}
|
|
124
|
-
if ([context isTranscribing]) {
|
|
125
|
-
reject(@"whisper_error", @"Context is already transcribing", nil);
|
|
126
|
-
return;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
NSString *path = waveFilePath;
|
|
130
|
-
if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
|
|
131
|
-
path = [RNWhisperDownloader downloadFile:path toFile:nil];
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
int count = 0;
|
|
135
|
-
float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
|
|
136
|
-
if (waveFile == nil) {
|
|
137
|
-
reject(@"whisper_error", @"Invalid file", nil);
|
|
138
|
-
return;
|
|
139
|
-
}
|
|
140
|
-
[context transcribeFile:jobId
|
|
141
|
-
audioData:waveFile
|
|
117
|
+
[context transcribeData:jobId
|
|
118
|
+
audioData:data
|
|
142
119
|
audioDataCount:count
|
|
143
120
|
options:options
|
|
144
121
|
onProgress: ^(int progress) {
|
|
@@ -171,11 +148,9 @@ RCT_REMAP_METHOD(transcribeFile,
|
|
|
171
148
|
}
|
|
172
149
|
onEnd: ^(int code) {
|
|
173
150
|
if (code != 0 && code != 999) {
|
|
174
|
-
free(waveFile);
|
|
175
151
|
reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
|
|
176
152
|
return;
|
|
177
153
|
}
|
|
178
|
-
free(waveFile);
|
|
179
154
|
NSMutableDictionary *result = [context getTextSegments];
|
|
180
155
|
result[@"isAborted"] = @([context isStoppedByAction]);
|
|
181
156
|
resolve(result);
|
|
@@ -183,6 +158,99 @@ RCT_REMAP_METHOD(transcribeFile,
|
|
|
183
158
|
];
|
|
184
159
|
}
|
|
185
160
|
|
|
161
|
+
// Transcribes a WAV source for the given context. The source string may be
//   - an http:// or https:// URL (downloaded first, then decoded as a file),
//   - a "data:audio/wav;base64," URI (decoded in memory), or
//   - anything else, which is treated as a local file path.
// Rejects with "whisper_error" when the context is missing or busy, or when
// the source cannot be decoded; otherwise hands the decoded samples to
// -transcribeData:withContextId:withJobId:withData:withDataCount:withOptions:withResolver:withRejecter:.
RCT_REMAP_METHOD(transcribeFile,
                 withContextId:(int)contextId
                 withJobId:(int)jobId
                 withWaveFile:(NSString *)waveFilePathOrDataBase64
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];

    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
    }
    if ([context isCapturing]) {
        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
        return;
    }
    if ([context isTranscribing]) {
        reject(@"whisper_error", @"Context is already transcribing", nil);
        return;
    }

    // Prefix that marks an inline base64 WAV payload. The payload offset is
    // derived from it so the two can never drift apart.
    NSString *wavDataUriPrefix = @"data:audio/wav;base64,";

    float *data = nil;
    int count = 0;
    if ([waveFilePathOrDataBase64 hasPrefix:@"http://"] || [waveFilePathOrDataBase64 hasPrefix:@"https://"]) {
        NSString *path = [RNWhisperDownloader downloadFile:waveFilePathOrDataBase64 toFile:nil];
        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
    } else if ([waveFilePathOrDataBase64 hasPrefix:wavDataUriPrefix]) {
        NSString *payload = [waveFilePathOrDataBase64 substringFromIndex:[wavDataUriPrefix length]];
        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:payload options:0];
        // A malformed payload decodes to nil. Leave `data` nil in that case so
        // the request is rejected below instead of being transcribed as an
        // empty buffer.
        if (waveData != nil) {
            data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
        }
    } else {
        data = [RNWhisperAudioUtils decodeWaveFile:waveFilePathOrDataBase64 count:&count];
    }
    if (data == nil) {
        reject(@"whisper_error", @"Invalid file", nil);
        return;
    }

    [self transcribeData:context
           withContextId:contextId
               withJobId:jobId
                withData:data
           withDataCount:count
             withOptions:options
            withResolver:resolve
            withRejecter:reject
    ];
}
|
|
210
|
+
|
|
211
|
+
// Transcribes base64-encoded PCM samples for the given context. The decoded
// bytes are passed to the audio decoder with cutHeader:NO, i.e. no header is
// stripped. Rejects with "whisper_error" when the context is missing or busy,
// or when the payload cannot be decoded.
RCT_REMAP_METHOD(transcribeData,
                 withContextId:(int)contextId
                 withJobId:(int)jobId
                 withData:(NSString *)dataBase64 // pcm data base64 encoded
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];

    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
    }
    if ([context isCapturing]) {
        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
        return;
    }
    if ([context isTranscribing]) {
        reject(@"whisper_error", @"Context is already transcribing", nil);
        return;
    }

    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:dataBase64 options:0];
    // Malformed base64 decodes to nil. Reject it here rather than letting it
    // flow on as an empty sample buffer.
    if (pcmData == nil) {
        reject(@"whisper_error", @"Invalid data", nil);
        return;
    }

    int count = 0;
    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];

    if (data == nil) {
        reject(@"whisper_error", @"Invalid data", nil);
        return;
    }

    [self transcribeData:context
           withContextId:contextId
               withJobId:jobId
                withData:data
           withDataCount:count
             withOptions:options
            withResolver:resolve
            withRejecter:reject
    ];
}
|
|
253
|
+
|
|
186
254
|
RCT_REMAP_METHOD(startRealtimeTranscribe,
|
|
187
255
|
withContextId:(int)contextId
|
|
188
256
|
withJobId:(int)jobId
|
|
@@ -244,6 +312,25 @@ RCT_REMAP_METHOD(abortTranscribe,
|
|
|
244
312
|
resolve(nil);
|
|
245
313
|
}
|
|
246
314
|
|
|
315
|
+
// Runs the benchmark of the context identified by contextId with the given
// maxThreads value and resolves with the string it returns. Rejects with
// "whisper_error" when the context does not exist or is currently transcribing.
RCT_REMAP_METHOD(bench,
                 withContextId:(int)contextId
                 withMaxThreads:(int)maxThreads
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
{
    RNWhisperContext *target = contexts[@(contextId)];

    // Work out the first failing precondition, if any, before touching the context.
    NSString *failure = nil;
    if (!target) {
        failure = @"Context not found";
    } else if ([target isTranscribing]) {
        failure = @"The context is transcribing";
    }

    if (failure) {
        reject(@"whisper_error", failure, nil);
        return;
    }

    resolve([target bench:maxThreads]);
}
|
|
333
|
+
|
|
247
334
|
RCT_REMAP_METHOD(releaseContext,
|
|
248
335
|
withContextId:(int)contextId
|
|
249
336
|
withResolver:(RCTPromiseResolveBlock)resolve
|
|
@@ -1,27 +1,38 @@
|
|
|
1
1
|
#import "RNWhisperAudioUtils.h"
|
|
2
|
+
#if RNWHISPER_BUILD_FROM_SOURCE
|
|
2
3
|
#import "whisper.h"
|
|
4
|
+
#else
|
|
5
|
+
#import <rnwhisper/whisper.h>
|
|
6
|
+
#endif
|
|
3
7
|
|
|
4
8
|
@implementation RNWhisperAudioUtils

/**
 * Converts 16-bit PCM samples into floats scaled into [-1.0, 1.0].
 *
 * @param data      Raw bytes, read as host-order `short` samples.
 * @param count     Out-parameter; receives the number of samples written on success.
 * @param cutHeader When YES, the first 44 bytes are skipped before decoding
 *                  (assumes a fixed 44-byte WAV header; other header layouts
 *                  are not parsed).
 * @return A malloc'ed float buffer that the caller must release with free(),
 *         or nil when `data` is nil, when `cutHeader` is YES but `data` is
 *         shorter than the header, or when allocation fails. `count` is left
 *         untouched on failure.
 */
+ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader {
    static const NSUInteger kWaveHeaderLength = 44;

    if (data == nil) {
        return nil;
    }

    NSData *waveData = data;
    if (cutHeader) {
        // Without this guard `[data length] - 44` underflows for short input
        // and -subdataWithRange: is handed an out-of-bounds range.
        if ([data length] < kWaveHeaderLength) {
            return nil;
        }
        waveData = [data subdataWithRange:NSMakeRange(kWaveHeaderLength, [data length] - kWaveHeaderLength)];
    }

    const short *shortArray = (const short *)[waveData bytes];
    int shortCount = (int) ([waveData length] / sizeof(short));
    float *floatArray = (float *) malloc(shortCount * sizeof(float));
    if (floatArray == NULL) {
        return nil;
    }

    for (NSInteger i = 0; i < shortCount; i++) {
        // Dividing by 32767 maps -32768 slightly below -1.0, hence the clamp.
        float floatValue = ((float)shortArray[i]) / 32767.0;
        floatValue = MAX(floatValue, -1.0);
        floatValue = MIN(floatValue, 1.0);
        floatArray[i] = floatValue;
    }
    *count = shortCount;
    return floatArray;
}

/**
 * Loads the file at `filePath` and decodes it with
 * +decodeWaveData:count:cutHeader: using cutHeader:YES.
 *
 * @param filePath Local file system path of the WAV file.
 * @param count    Out-parameter; receives the number of decoded samples on success.
 * @return A malloc'ed float buffer owned by the caller, or nil when the path
 *         is nil, the file cannot be read, or decoding fails.
 */
+ (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
    // A nil path cannot name a file; bail out before building a file URL from it.
    if (filePath == nil) {
        return nil;
    }
    NSURL *url = [NSURL fileURLWithPath:filePath];
    NSData *fileData = [NSData dataWithContentsOfURL:url];
    if (fileData == nil) {
        return nil;
    }
    return [RNWhisperAudioUtils decodeWaveData:fileData count:count cutHeader:YES];
}

@end
|