whisper.rn 0.4.0-rc.1 → 0.4.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +14 -0
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +226 -109
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/README.md +1 -1
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +5 -3
- package/cpp/ggml-aarch64.c +129 -0
- package/cpp/ggml-aarch64.h +19 -0
- package/cpp/ggml-alloc.c +805 -400
- package/cpp/ggml-alloc.h +60 -10
- package/cpp/ggml-backend-impl.h +216 -0
- package/cpp/ggml-backend-reg.cpp +204 -0
- package/cpp/ggml-backend.cpp +1996 -0
- package/cpp/ggml-backend.cpp.rej +12 -0
- package/cpp/ggml-backend.h +336 -0
- package/cpp/ggml-common.h +1853 -0
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu-aarch64.c +3560 -0
- package/cpp/ggml-cpu-aarch64.h +30 -0
- package/cpp/ggml-cpu-impl.h +371 -0
- package/cpp/ggml-cpu-quants.c +10822 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13970 -0
- package/cpp/ggml-cpu.cpp +663 -0
- package/cpp/ggml-cpu.h +177 -0
- package/cpp/ggml-impl.h +551 -0
- package/cpp/ggml-metal-impl.h +249 -0
- package/cpp/ggml-metal.h +24 -43
- package/cpp/ggml-metal.m +4190 -1075
- package/cpp/ggml-quants.c +5247 -0
- package/cpp/ggml-quants.h +100 -0
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +5474 -18763
- package/cpp/ggml.h +833 -628
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +221 -52
- package/cpp/rn-whisper.h +50 -15
- package/cpp/whisper.cpp +2872 -1371
- package/cpp/whisper.h +170 -41
- package/ios/RNWhisper.mm +139 -46
- package/ios/RNWhisperAudioUtils.h +1 -2
- package/ios/RNWhisperAudioUtils.m +18 -67
- package/ios/RNWhisperContext.h +11 -8
- package/ios/RNWhisperContext.mm +195 -150
- package/jest/mock.js +15 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +76 -28
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +76 -28
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +13 -4
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +37 -5
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +9 -7
- package/src/NativeRNWhisper.ts +20 -4
- package/src/index.ts +98 -42
- package/src/version.json +1 -1
- package/whisper-rn.podspec +11 -18
- package/cpp/ggml-metal.metal +0 -2353
package/ios/RNWhisperContext.mm
CHANGED
@@ -1,30 +1,97 @@
 #import "RNWhisperContext.h"
-#import
+#import <Metal/Metal.h>
 #include <vector>
 
 #define NUM_BYTES_PER_BUFFER 16 * 1024
 
 @implementation RNWhisperContext
 
-+ (instancetype)initWithModelPath:(NSString *)modelPath
++ (instancetype)initWithModelPath:(NSString *)modelPath
+                        contextId:(int)contextId
+                         noCoreML:(BOOL)noCoreML
+                          noMetal:(BOOL)noMetal
+                     useFlashAttn:(BOOL)useFlashAttn
+{
     RNWhisperContext *context = [[RNWhisperContext alloc] init];
     context->contextId = contextId;
-
-
-
-
-
+    struct whisper_context_params cparams;
+    NSString *reasonNoMetal = @"";
+    cparams.use_gpu = !noMetal;
+    cparams.flash_attn = useFlashAttn;
+
+    // TODO: Expose dtw_token_timestamps and dtw_aheads_preset
+    cparams.dtw_token_timestamps = false;
+    // cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
+
+    cparams.use_coreml = !noCoreML;
+#ifndef WHISPER_USE_COREML
+    if (cparams.use_coreml) {
+        NSLog(@"[RNWhisper] CoreML is not enabled in this build, ignoring use_coreml option");
+        cparams.use_coreml = false;
+    }
+#endif
+
+#ifndef WSP_GGML_USE_METAL
+    if (cparams.use_gpu) {
+        NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
+        cparams.use_gpu = false;
     }
-#else
-    context->ctx = whisper_init_from_file([modelPath UTF8String]);
 #endif
+
+#ifdef WSP_GGML_USE_METAL
+    if (cparams.use_gpu) {
+#if TARGET_OS_SIMULATOR
+        NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
+        cparams.use_gpu = false;
+#else // TARGET_OS_SIMULATOR
+        // Check ggml-metal availability
+        NSError * error = nil;
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+        id<MTLLibrary> library = [device
+            newLibraryWithSource:@"#include <metal_stdlib>\n"
+                                  "using namespace metal;"
+                                  "kernel void test() { simd_sum(0); }"
+            options:nil
+            error:&error
+        ];
+        if (error) {
+            reasonNoMetal = [error localizedDescription];
+        } else {
+            id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
+            id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
+            if (pipeline == nil) {
+                reasonNoMetal = [error localizedDescription];
+                NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
+                cparams.use_gpu = false;
+            }
+        }
+#endif // TARGET_OS_SIMULATOR
+    }
+#endif // WSP_GGML_USE_METAL
+
+    if (cparams.use_gpu && cparams.use_coreml) {
+        NSLog(@"[RNWhisper] Both use_gpu and use_coreml are enabled, ignoring use_coreml option");
+        cparams.use_coreml = false; // Skip CoreML if Metal is enabled
+    }
+
+    context->ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
     context->dQueue = dispatch_queue_create(
         [[NSString stringWithFormat:@"RNWhisperContext-%d", contextId] UTF8String],
         DISPATCH_QUEUE_SERIAL
     );
+    context->isMetalEnabled = cparams.use_gpu;
+    context->reasonNoMetal = reasonNoMetal;
     return context;
 }
 
+- (bool)isMetalEnabled {
+    return isMetalEnabled;
+}
+
+- (NSString *)reasonNoMetal {
+    return reasonNoMetal;
+}
+
 - (struct whisper_context *)getContext {
     return self->ctx;
 }
@@ -33,7 +100,7 @@
     return self->dQueue;
 }
 
-- (void)prepareRealtime:(NSDictionary *)options {
+- (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
     self->recordState.options = options;
 
     self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -46,68 +113,40 @@
     self->recordState.dataFormat.mReserved = 0;
     self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
 
-
-
-    self->recordState.
-
-    int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
-    int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-
-    self->recordState.audioSliceSec = audioSliceSec;
-    self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+    self->recordState.isRealtime = true;
+    self->recordState.isTranscribing = false;
+    self->recordState.isCapturing = false;
+    self->recordState.isStoppedByAction = false;
 
     self->recordState.sliceIndex = 0;
     self->recordState.transcribeSliceIndex = 0;
     self->recordState.nSamplesTranscribing = 0;
 
-
-    self->recordState.
-
-
-
-
-
-
-
-
-
-
+    self->recordState.sliceNSamples.clear();
+    self->recordState.sliceNSamples.push_back(0);
+
+    self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+    self->recordState.job->set_realtime_params(
+        {
+            .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+            .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+            .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+            .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+        },
+        options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+        options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+        options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
+        options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+    );
+    self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
 
     self->recordState.mSelf = self;
 }
 
-
-    if (self->recordState.shortBufferSlices != nil) {
-        for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
-            int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
-            free(audioBufferI16);
-        }
-        self->recordState.shortBufferSlices = nil;
-    }
-}
-
-bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
 {
-
-
-    int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
-    int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
-    if (nSamples + n > sampleSize) {
-        int start = nSamples + n - sampleSize;
-        std::vector<float> audioBufferF32Vec(sampleSize);
-        for (int i = 0; i < sampleSize; i++) {
-            audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-        }
-        float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
-        float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
-        isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
-        NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-    } else {
-        isSpeech = false;
-    }
-    }
-    return isSpeech;
+    if (state->isTranscribing) return true;
+    return state->job->vad_simple(sliceIndex, nSamples, n);
 }
 
 void AudioInputCallback(void * inUserData,
@@ -122,21 +161,21 @@ void AudioInputCallback(void * inUserData,
     if (!state->isCapturing) {
         NSLog(@"[RNWhisper] Not capturing, ignoring audio");
         if (!state->isTranscribing) {
-            state->
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
         }
         return;
     }
 
     int totalNSamples = 0;
-    for (int i = 0; i <
-        totalNSamples +=
+    for (int i = 0; i < state->sliceNSamples.size(); i++) {
+        totalNSamples += state->sliceNSamples[i];
     }
 
     const int n = inBuffer->mAudioDataByteSize / 2;
 
-    int nSamples =
+    int nSamples = state->sliceNSamples[state->sliceIndex];
 
-    if (totalNSamples + n > state->
+    if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
         NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
         state->isCapturing = false;
         [state->mSelf stopAudio];
@@ -145,14 +184,14 @@ void AudioInputCallback(void * inUserData,
         nSamples == state->nSamplesTranscribing &&
         state->sliceIndex == state->transcribeSliceIndex
     ) {
-        state->
+        [state->mSelf finishRealtimeTranscribe:state result:@{}];
     } else if (
         !state->isTranscribing &&
         nSamples != state->nSamplesTranscribing
     ) {
-
-        if (!vad(state,
-            state->
+        bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+        if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
             return;
         }
         state->isTranscribing = true;
@@ -163,31 +202,25 @@ void AudioInputCallback(void * inUserData,
         return;
     }
 
-
-    if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+    if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
        // next slice
        state->sliceIndex++;
        nSamples = 0;
-
-        [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-        [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+        state->sliceNSamples.push_back(0);
    }
 
-
-    NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
+    NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);
 
-
-    for (int i = 0; i < n; i++) {
-        audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
-    }
+    state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
 
-    bool isSpeech = vad(state,
+    bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
     nSamples += n;
-    state->sliceNSamples[state->sliceIndex] =
+    state->sliceNSamples[state->sliceIndex] = nSamples;
 
     AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 
-
+    bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+    if (!isSamplesEnough || !isSpeech) return;
 
     if (!state->isTranscribing) {
         state->isTranscribing = true;
@@ -197,21 +230,29 @@ void AudioInputCallback(void * inUserData,
     }
 }
 
+- (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
+    // Save wav if needed
+    if (state->job->audio_output_path != nullptr) {
+        // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+        rnaudioutils::save_wav_file(
+            rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+            state->job->audio_output_path
+        );
+    }
+    state->transcribeHandler(state->job->job_id, @"end", result);
+    rnwhisper::job_remove(state->job->job_id);
+}
+
 - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
-    int nSamplesOfIndex =
+    int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
     state->nSamplesTranscribing = nSamplesOfIndex;
     NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
 
-
-
-    // convert I16 to F32
-    for (int i = 0; i < state->nSamplesTranscribing; i++) {
-        audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
-    }
+    float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
+
     CFTimeInterval timeStart = CACurrentMediaTime();
-
-
-    free(audioBufferF32);
+    int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+    free(pcmf32);
     CFTimeInterval timeEnd = CACurrentMediaTime();
     const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
@@ -231,7 +272,7 @@ void AudioInputCallback(void * inUserData,
         result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
     }
 
-    nSamplesOfIndex =
+    nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
 
     bool isStopped = state->isStoppedByAction || (
         !state->isCapturing &&
@@ -256,23 +297,13 @@ void AudioInputCallback(void * inUserData,
         result[@"isStoppedByAction"] = @(state->isStoppedByAction);
         result[@"isCapturing"] = @(false);
 
-
-        if (state->options[@"audioOutputPath"] != nil) {
-            // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-            [RNWhisperAudioUtils
-                saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
-                sliceNSamples:state->sliceNSamples]
-                audioOutputFile:state->options[@"audioOutputPath"]
-            ];
-        }
-
-        state->transcribeHandler(state->jobId, @"end", result);
+        [state->mSelf finishRealtimeTranscribe:state result:result];
     } else if (code == 0) {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     } else {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     }
 
     if (continueNeeded) {
@@ -300,8 +331,7 @@ void AudioInputCallback(void * inUserData,
                 onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
 {
     self->recordState.transcribeHandler = onTranscribe;
-    self
-    [self prepareRealtime:options];
+    [self prepareRealtime:jobId options:options];
 
     OSStatus status = AudioQueueNewInput(
         &self->recordState.dataFormat,
@@ -329,9 +359,10 @@ void AudioInputCallback(void * inUserData,
 struct rnwhisper_segments_callback_data {
     void (^onNewSegments)(NSDictionary *);
     int total_n_new;
+    bool tdrzEnable;
 };
 
-- (void)
+- (void)transcribeData:(int)jobId
              audioData:(float *)audioData
         audioDataCount:(int)audioDataCount
                options:(NSDictionary *)options
@@ -342,9 +373,9 @@ struct rnwhisper_segments_callback_data {
     dispatch_async(dQueue, ^{
         self->recordState.isStoppedByAction = false;
         self->recordState.isTranscribing = true;
-        self->recordState.jobId = jobId;
 
-        whisper_full_params params = [self
+        whisper_full_params params = [self createParams:options jobId:jobId];
+
         if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
             params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
                 void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -362,12 +393,18 @@ struct rnwhisper_segments_callback_data {
                 NSMutableArray *segments = [[NSMutableArray alloc] init];
                 for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
                     const char * text_cur = whisper_full_get_segment_text(ctx, i);
-
+                    NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+                    if (data->tdrzEnable && whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+                        [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+                    }
+
+                    text = [text stringByAppendingString:mutable_ns_text];
 
                     const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                     const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
                     NSDictionary *segment = @{
-                        @"text": [NSString
+                        @"text": [NSString stringWithString:mutable_ns_text],
                         @"t0": [NSNumber numberWithLongLong:t0],
                         @"t1": [NSNumber numberWithLongLong:t1]
                     };
@@ -385,12 +422,16 @@ struct rnwhisper_segments_callback_data {
             };
             struct rnwhisper_segments_callback_data user_data = {
                 .onNewSegments = onNewSegments,
-                .
+                .tdrzEnable = options[@"tdrzEnable"] && [options[@"tdrzEnable"] boolValue],
+                .total_n_new = 0,
             };
             params.new_segment_callback_user_data = &user_data;
         }
-
-
+
+        rnwhisper::job* job = rnwhisper::job_new(jobId, params);
+        self->recordState.job = job;
+        int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+        rnwhisper::job_remove(jobId);
         self->recordState.isTranscribing = false;
         onEnd(code);
     });
@@ -405,9 +446,13 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopTranscribe:(int)jobId {
-
+    if (self->recordState.job) self->recordState.job->abort();
     if (self->recordState.isRealtime && self->recordState.isCapturing) {
         [self stopAudio];
+        if (!self->recordState.isTranscribing) {
+            // Handle for VAD case
+            self->recordState.transcribeHandler(jobId, @"end", @{});
+        }
     }
     self->recordState.isCapturing = false;
     self->recordState.isStoppedByAction = true;
@@ -415,13 +460,11 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopCurrentTranscribe {
-    if (
-
-    }
-    [self stopTranscribe:self->recordState.jobId];
+    if (self->recordState.job == nullptr) return;
+    [self stopTranscribe:self->recordState.job->job_id];
 }
 
-- (struct whisper_full_params)
+- (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
     const int n_threads = options[@"maxThreads"] != nil ?
@@ -440,9 +483,8 @@ struct rnwhisper_segments_callback_data {
     params.print_progress = false;
     params.print_timestamps = false;
     params.print_special = false;
-    params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
     params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
-    params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+    params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
     params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
     params.offset_ms = 0;
     params.no_context = true;
@@ -452,6 +494,7 @@ struct rnwhisper_segments_callback_data {
         params.max_len = [options[@"maxLen"] intValue];
     }
     params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+    params.tdrz_enable = options[@"tdrzEnable"] != nil ? [options[@"tdrzEnable"] boolValue] : false;
 
     if (options[@"bestOf"] != nil) {
         params.greedy.best_of = [options[@"bestOf"] intValue];
@@ -459,7 +502,6 @@ struct rnwhisper_segments_callback_data {
     if (options[@"maxContext"] != nil) {
         params.n_max_text_ctx = [options[@"maxContext"] intValue];
     }
-
     if (options[@"offset"] != nil) {
         params.offset_ms = [options[@"offset"] intValue];
     }
@@ -475,35 +517,20 @@ struct rnwhisper_segments_callback_data {
     if (options[@"temperatureInc"] != nil) {
         params.temperature_inc = [options[@"temperature_inc"] floatValue];
     }
-
     if (options[@"prompt"] != nil) {
-        params.initial_prompt = [options[@"prompt"] UTF8String];
+        params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
    }
 
-    // abort handler
-    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return !is_aborted;
-    };
-    params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(jobId);
-    params.abort_callback = [](void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return is_aborted;
-    };
-    params.abort_callback_user_data = rn_whisper_assign_abort_map(jobId);
-
     return params;
 }
 
-- (int)fullTranscribe:(
-    params:(struct whisper_full_params)params
+- (int)fullTranscribe:(rnwhisper::job *)job
             audioData:(float *)audioData
        audioDataCount:(int)audioDataCount
 {
     whisper_reset_timings(self->ctx);
-
-
-    rn_whisper_remove_abort_map(jobId);
+    int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+    if (job && job->is_aborted()) code = -999;
     // if (code == 0) {
     //     whisper_print_timings(self->ctx);
     // }
@@ -517,12 +544,21 @@ struct rnwhisper_segments_callback_data {
     NSMutableArray *segments = [[NSMutableArray alloc] init];
     for (int i = 0; i < n_segments; i++) {
         const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
-
+        NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+        // Simplified condition
+        if (self->recordState.options[@"tdrzEnable"] &&
+            [self->recordState.options[@"tdrzEnable"] boolValue] &&
+            whisper_full_get_segment_speaker_turn_next(self->ctx, i)) {
+            [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+        }
+
+        text = [text stringByAppendingString:mutable_ns_text];
 
         const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
         const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
         NSDictionary *segment = @{
-            @"text": [NSString
+            @"text": [NSString stringWithString:mutable_ns_text],
             @"t0": [NSNumber numberWithLongLong:t0],
             @"t1": [NSNumber numberWithLongLong:t1]
         };
@@ -534,10 +570,19 @@ struct rnwhisper_segments_callback_data {
     return result;
 }
 
+- (NSString *)bench:(int)maxThreads {
+    const int n_threads = maxThreads > 0 ? maxThreads : 0;
+
+    const int max_threads = (int) [[NSProcessInfo processInfo] processorCount];
+    // Use 2 threads by default on 4-core devices, 4 threads on more cores
+    const int default_n_threads = max_threads == 4 ? 2 : MIN(4, max_threads);
+    NSString *result = [NSString stringWithUTF8String:rnwhisper::bench(self->ctx, n_threads).c_str()];
+    return result;
+}
+
 - (void)invalidate {
     [self stopCurrentTranscribe];
     whisper_free(self->ctx);
-    [self freeBufferIfNeeded];
 }
 
 @end
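The realtime path above reads every option straight out of the NSDictionary passed across the bridge, so the new keys are visible in the diff itself: useVad, vadMs, vadThold, vadFreqThold, realtimeAudioSec, realtimeAudioSliceSec, realtimeAudioMinSec, audioOutputPath, and tdrzEnable. Below is a minimal TypeScript sketch of how a caller might exercise them; initWhisper and transcribeRealtime are the package's public wrappers, but the exact option plumbing shown here is inferred from this diff rather than taken from documented API, so treat it as an assumption.

import { initWhisper } from 'whisper.rn'

async function realtimeDemo(): Promise<void> {
  // Placeholder model path; a real app would resolve a bundled or downloaded model.
  const ctx = await initWhisper({ filePath: 'ggml-base.bin' })

  const { stop, subscribe } = await ctx.transcribeRealtime({
    useVad: true,              // forwarded to job->set_realtime_params (native default: false)
    vadMs: 2000,               // these three values match the native fallbacks above
    vadThold: 0.6,
    vadFreqThold: 100.0,
    realtimeAudioSec: 60,      // job->audio_sec: total capture budget
    realtimeAudioSliceSec: 25, // job->audio_slice_sec: a value < audio_sec enables slicing
    realtimeAudioMinSec: 0.5,  // minimum captured audio before VAD/transcription runs
    tdrzEnable: true,          // appends " [SPEAKER_TURN]" on tinydiarize speaker turns
  })

  subscribe((event: any) => {
    // "transcribe" events carry partial results; the final "end" event is emitted by
    // finishRealtimeTranscribe, which also writes the wav when audioOutputPath is set.
    console.log(event.isCapturing, event.data?.result)
  })

  // Stopping aborts the job (job->abort()); an aborted whisper_full is reported as code -999.
  setTimeout(stop, 10_000)
}

One design consequence worth noting: abort handling moved from the per-job abort map (rn_whisper_assign_abort_map / rn_whisper_remove_abort_map) into the rnwhisper::job object itself, which is why createParams no longer installs encoder_begin_callback / abort_callback.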
package/jest/mock.js
CHANGED
@@ -2,12 +2,17 @@ const { NativeModules, DeviceEventEmitter } = require('react-native')
 
 if (!NativeModules.RNWhisper) {
   NativeModules.RNWhisper = {
-    initContext: jest.fn(() => Promise.resolve(1)),
+    initContext: jest.fn(() => Promise.resolve({ contextId: 1 })),
     transcribeFile: jest.fn(() => Promise.resolve({
       result: ' Test',
       segments: [{ text: ' Test', t0: 0, t1: 33 }],
       isAborted: false,
     })),
+    transcribeData: jest.fn(() => Promise.resolve({
+      result: ' Test',
+      segments: [{ text: ' Test', t0: 0, t1: 33 }],
+      isAborted: false,
+    })),
     startRealtimeTranscribe: jest.fn((contextId, jobId) => {
       setTimeout(() => {
         // Start
@@ -45,11 +50,19 @@ if (!NativeModules.RNWhisper) {
       })
     }),
+    bench: jest.fn(() => Promise.resolve({
+      config: 'NEON',
+      nThreads: 1,
+      encodeMs: 1,
+      decodeMs: 1,
+      batchMs: 1,
+      promptMs: 1,
+    })),
     releaseContext: jest.fn(() => Promise.resolve()),
     releaseAllContexts: jest.fn(() => Promise.resolve()),
 
     // iOS AudioSession utils
-    getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
+    getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
       category: 'AVAudioSessionCategoryPlayAndRecord',
       options: [],
     })),
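The mock now mirrors the rc.10 native surface: initContext resolves an object instead of a bare id, and transcribeData and bench gain stubs. A small Jest sketch against it follows; the import path assumes the shipped jest/ folder is consumed as whisper.rn/jest/mock, which is the conventional wiring but is not confirmed by this diff.

import 'whisper.rn/jest/mock' // side effect: installs the NativeModules.RNWhisper stubs
import { NativeModules } from 'react-native'

test('initContext resolves an object, not a bare id', async () => {
  const res = await NativeModules.RNWhisper.initContext({ filePath: 'model.bin' })
  expect(res).toEqual({ contextId: 1 }) // rc.1 resolved the number 1 here

  const bench = await NativeModules.RNWhisper.bench(res.contextId, 4)
  expect(bench.config).toBe('NEON') // the bench stub is new in this version range
})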
package/lib/commonjs/NativeRNWhisper.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,
+{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0GnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}