whisper.rn 0.4.0-rc.0 → 0.4.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +14 -0
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +226 -109
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/README.md +1 -1
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +5 -3
- package/cpp/ggml-aarch64.c +129 -0
- package/cpp/ggml-aarch64.h +19 -0
- package/cpp/ggml-alloc.c +805 -400
- package/cpp/ggml-alloc.h +60 -10
- package/cpp/ggml-backend-impl.h +216 -0
- package/cpp/ggml-backend-reg.cpp +204 -0
- package/cpp/ggml-backend.cpp +1996 -0
- package/cpp/ggml-backend.cpp.rej +12 -0
- package/cpp/ggml-backend.h +336 -0
- package/cpp/ggml-common.h +1853 -0
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu-aarch64.c +3560 -0
- package/cpp/ggml-cpu-aarch64.h +30 -0
- package/cpp/ggml-cpu-impl.h +371 -0
- package/cpp/ggml-cpu-quants.c +10822 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13970 -0
- package/cpp/ggml-cpu.cpp +663 -0
- package/cpp/ggml-cpu.h +177 -0
- package/cpp/ggml-impl.h +551 -0
- package/cpp/ggml-metal-impl.h +249 -0
- package/cpp/ggml-metal.h +24 -43
- package/cpp/ggml-metal.m +4190 -1075
- package/cpp/ggml-quants.c +5247 -0
- package/cpp/ggml-quants.h +100 -0
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +5474 -18763
- package/cpp/ggml.h +833 -628
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +221 -52
- package/cpp/rn-whisper.h +50 -15
- package/cpp/whisper.cpp +2863 -1340
- package/cpp/whisper.h +170 -38
- package/ios/RNWhisper.mm +141 -46
- package/ios/RNWhisperAudioUtils.h +1 -2
- package/ios/RNWhisperAudioUtils.m +18 -67
- package/ios/RNWhisperContext.h +11 -8
- package/ios/RNWhisperContext.mm +197 -144
- package/jest/mock.js +15 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +78 -28
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +78 -28
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +14 -4
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +39 -5
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +9 -7
- package/src/NativeRNWhisper.ts +21 -4
- package/src/index.ts +102 -42
- package/src/version.json +1 -1
- package/whisper-rn.podspec +11 -18
- package/cpp/ggml-metal.metal +0 -2353
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
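
Most of the line count above is a vendored ggml/whisper.cpp sync (the new `ggml-cpu*`, `ggml-backend*`, `ggml-common.h`, and `ggml-quants` sources). The API-visible changes are concentrated in the native bindings and the TypeScript wrapper: new context-init options (`noCoreML`, `noMetal`, `useFlashAttn` on the native side), a reworked realtime-transcription job with built-in VAD, tinydiarize speaker-turn markers via `tdrzEnable`, and new `bench` and `transcribeData` entry points. As a sketch of the realtime surface — the option keys below are exactly the ones the new `RNWhisperContext.mm` diff below reads, while the `initWhisper`/`transcribeRealtime` wrapper names and event shape follow the package README, so treat the wiring as an assumption:

```ts
import { initWhisper } from 'whisper.rn'

// Sketch of the 0.4.0 realtime + VAD options (defaults in comments are the
// native fallbacks visible in the diff below).
async function realtimeWithVad(modelPath: string) {
  const whisperContext = await initWhisper({ filePath: modelPath })

  const { stop, subscribe } = await whisperContext.transcribeRealtime({
    realtimeAudioSec: 60,       // total capture budget (job->audio_sec)
    realtimeAudioSliceSec: 25,  // slice the buffer to bound memory use
    realtimeAudioMinSec: 1,     // skip transcription until enough audio exists
    useVad: true,               // only transcribe slices that contain speech
    vadMs: 2000,                // VAD window, native default 2000
    vadThold: 0.6,              // speech threshold, native default 0.6
    vadFreqThold: 100.0,        // frequency threshold, native default 100.0
    // audioOutputPath: wavPath, // optionally save the captured audio as WAV
  })

  subscribe((evt) => {
    const { isCapturing, data, processTime, recordingTime } = evt
    console.log(isCapturing, data?.result, processTime, recordingTime)
  })

  // stop() ends the session early; the native side emits a final "end" event.
  return stop
}
```
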
package/ios/RNWhisperContext.mm
CHANGED

```diff
@@ -1,22 +1,97 @@
 #import "RNWhisperContext.h"
-#import
+#import <Metal/Metal.h>
 #include <vector>
 
 #define NUM_BYTES_PER_BUFFER 16 * 1024
 
 @implementation RNWhisperContext
 
-+ (instancetype)initWithModelPath:(NSString *)modelPath
++ (instancetype)initWithModelPath:(NSString *)modelPath
+    contextId:(int)contextId
+    noCoreML:(BOOL)noCoreML
+    noMetal:(BOOL)noMetal
+    useFlashAttn:(BOOL)useFlashAttn
+{
     RNWhisperContext *context = [[RNWhisperContext alloc] init];
     context->contextId = contextId;
-
+    struct whisper_context_params cparams;
+    NSString *reasonNoMetal = @"";
+    cparams.use_gpu = !noMetal;
+    cparams.flash_attn = useFlashAttn;
+
+    // TODO: Expose dtw_token_timestamps and dtw_aheads_preset
+    cparams.dtw_token_timestamps = false;
+    // cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
+
+    cparams.use_coreml = !noCoreML;
+#ifndef WHISPER_USE_COREML
+    if (cparams.use_coreml) {
+        NSLog(@"[RNWhisper] CoreML is not enabled in this build, ignoring use_coreml option");
+        cparams.use_coreml = false;
+    }
+#endif
+
+#ifndef WSP_GGML_USE_METAL
+    if (cparams.use_gpu) {
+        NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
+        cparams.use_gpu = false;
+    }
+#endif
+
+#ifdef WSP_GGML_USE_METAL
+    if (cparams.use_gpu) {
+#if TARGET_OS_SIMULATOR
+        NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
+        cparams.use_gpu = false;
+#else // TARGET_OS_SIMULATOR
+        // Check ggml-metal availability
+        NSError * error = nil;
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+        id<MTLLibrary> library = [device
+            newLibraryWithSource:@"#include <metal_stdlib>\n"
+                                  "using namespace metal;"
+                                  "kernel void test() { simd_sum(0); }"
+            options:nil
+            error:&error
+        ];
+        if (error) {
+            reasonNoMetal = [error localizedDescription];
+        } else {
+            id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
+            id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
+            if (pipeline == nil) {
+                reasonNoMetal = [error localizedDescription];
+                NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
+                cparams.use_gpu = false;
+            }
+        }
+#endif // TARGET_OS_SIMULATOR
+    }
+#endif // WSP_GGML_USE_METAL
+
+    if (cparams.use_gpu && cparams.use_coreml) {
+        NSLog(@"[RNWhisper] Both use_gpu and use_coreml are enabled, ignoring use_coreml option");
+        cparams.use_coreml = false; // Skip CoreML if Metal is enabled
+    }
+
+    context->ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
     context->dQueue = dispatch_queue_create(
         [[NSString stringWithFormat:@"RNWhisperContext-%d", contextId] UTF8String],
         DISPATCH_QUEUE_SERIAL
     );
+    context->isMetalEnabled = cparams.use_gpu;
+    context->reasonNoMetal = reasonNoMetal;
     return context;
 }
 
+- (bool)isMetalEnabled {
+    return isMetalEnabled;
+}
+
+- (NSString *)reasonNoMetal {
+    return reasonNoMetal;
+}
+
 - (struct whisper_context *)getContext {
     return self->ctx;
 }
@@ -25,7 +100,7 @@
     return self->dQueue;
 }
 
-- (void)prepareRealtime:(NSDictionary *)options {
+- (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
     self->recordState.options = options;
 
     self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -38,68 +113,40 @@
     self->recordState.dataFormat.mReserved = 0;
     self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
 
-
-
-    self->recordState.
-
-    int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
-    int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-
-    self->recordState.audioSliceSec = audioSliceSec;
-    self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+    self->recordState.isRealtime = true;
+    self->recordState.isTranscribing = false;
+    self->recordState.isCapturing = false;
+    self->recordState.isStoppedByAction = false;
 
     self->recordState.sliceIndex = 0;
     self->recordState.transcribeSliceIndex = 0;
     self->recordState.nSamplesTranscribing = 0;
 
-
-    self->recordState.
-
-
-
-
-
-
-
-
-
-
+    self->recordState.sliceNSamples.clear();
+    self->recordState.sliceNSamples.push_back(0);
+
+    self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+    self->recordState.job->set_realtime_params(
+        {
+            .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+            .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+            .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+            .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+        },
+        options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+        options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+        options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
+        options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+    );
+    self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
 
     self->recordState.mSelf = self;
 }
 
-
-    if (self->recordState.shortBufferSlices != nil) {
-        for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
-            int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
-            free(audioBufferI16);
-        }
-        self->recordState.shortBufferSlices = nil;
-    }
-}
-
-bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
 {
-
-
-    int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
-    int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
-    if (nSamples + n > sampleSize) {
-        int start = nSamples + n - sampleSize;
-        std::vector<float> audioBufferF32Vec(sampleSize);
-        for (int i = 0; i < sampleSize; i++) {
-            audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-        }
-        float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
-        float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
-        isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
-        NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-    } else {
-        isSpeech = false;
-    }
-    }
-    return isSpeech;
+    if (state->isTranscribing) return true;
+    return state->job->vad_simple(sliceIndex, nSamples, n);
 }
 
 void AudioInputCallback(void * inUserData,
@@ -114,21 +161,21 @@ void AudioInputCallback(void * inUserData,
     if (!state->isCapturing) {
         NSLog(@"[RNWhisper] Not capturing, ignoring audio");
         if (!state->isTranscribing) {
-            state->
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
         }
         return;
     }
 
     int totalNSamples = 0;
-    for (int i = 0; i <
-        totalNSamples +=
+    for (int i = 0; i < state->sliceNSamples.size(); i++) {
+        totalNSamples += state->sliceNSamples[i];
     }
 
     const int n = inBuffer->mAudioDataByteSize / 2;
 
-    int nSamples =
+    int nSamples = state->sliceNSamples[state->sliceIndex];
 
-    if (totalNSamples + n > state->
+    if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
         NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
         state->isCapturing = false;
         [state->mSelf stopAudio];
@@ -137,14 +184,14 @@ void AudioInputCallback(void * inUserData,
         nSamples == state->nSamplesTranscribing &&
         state->sliceIndex == state->transcribeSliceIndex
     ) {
-        state->
+        [state->mSelf finishRealtimeTranscribe:state result:@{}];
     } else if (
         !state->isTranscribing &&
         nSamples != state->nSamplesTranscribing
     ) {
-
-        if (!vad(state,
-            state->
+        bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+        if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
             return;
         }
         state->isTranscribing = true;
@@ -155,31 +202,25 @@ void AudioInputCallback(void * inUserData,
         return;
     }
 
-
-    if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+    if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
         // next slice
         state->sliceIndex++;
         nSamples = 0;
-
-        [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-        [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+        state->sliceNSamples.push_back(0);
    }
 
-
-    NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
+    NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);
 
-
-    for (int i = 0; i < n; i++) {
-        audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
-    }
+    state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
 
-    bool isSpeech = vad(state,
+    bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
     nSamples += n;
-    state->sliceNSamples[state->sliceIndex] =
+    state->sliceNSamples[state->sliceIndex] = nSamples;
 
     AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 
-
+    bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+    if (!isSamplesEnough || !isSpeech) return;
 
     if (!state->isTranscribing) {
         state->isTranscribing = true;
@@ -189,21 +230,29 @@ void AudioInputCallback(void * inUserData,
     }
 }
 
+- (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
+    // Save wav if needed
+    if (state->job->audio_output_path != nullptr) {
+        // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+        rnaudioutils::save_wav_file(
+            rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+            state->job->audio_output_path
+        );
+    }
+    state->transcribeHandler(state->job->job_id, @"end", result);
+    rnwhisper::job_remove(state->job->job_id);
+}
+
 - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
-    int nSamplesOfIndex =
+    int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
     state->nSamplesTranscribing = nSamplesOfIndex;
     NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
 
-
-
-    // convert I16 to F32
-    for (int i = 0; i < state->nSamplesTranscribing; i++) {
-        audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
-    }
+    float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
+
     CFTimeInterval timeStart = CACurrentMediaTime();
-
-
-    free(audioBufferF32);
+    int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+    free(pcmf32);
     CFTimeInterval timeEnd = CACurrentMediaTime();
     const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
 
@@ -223,7 +272,7 @@ void AudioInputCallback(void * inUserData,
         result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
     }
 
-    nSamplesOfIndex =
+    nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
 
     bool isStopped = state->isStoppedByAction || (
         !state->isCapturing &&
@@ -248,23 +297,13 @@ void AudioInputCallback(void * inUserData,
         result[@"isStoppedByAction"] = @(state->isStoppedByAction);
         result[@"isCapturing"] = @(false);
 
-
-        if (state->options[@"audioOutputPath"] != nil) {
-            // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-            [RNWhisperAudioUtils
-                saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
-                    sliceNSamples:state->sliceNSamples]
-                audioOutputFile:state->options[@"audioOutputPath"]
-            ];
-        }
-
-        state->transcribeHandler(state->jobId, @"end", result);
+        [state->mSelf finishRealtimeTranscribe:state result:result];
     } else if (code == 0) {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     } else {
         result[@"isCapturing"] = @(true);
-        state->transcribeHandler(state->
+        state->transcribeHandler(state->job->job_id, @"transcribe", result);
     }
 
     if (continueNeeded) {
@@ -292,8 +331,7 @@ void AudioInputCallback(void * inUserData,
     onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
 {
     self->recordState.transcribeHandler = onTranscribe;
-    self
-    [self prepareRealtime:options];
+    [self prepareRealtime:jobId options:options];
 
     OSStatus status = AudioQueueNewInput(
         &self->recordState.dataFormat,
@@ -321,9 +359,10 @@ void AudioInputCallback(void * inUserData,
 struct rnwhisper_segments_callback_data {
     void (^onNewSegments)(NSDictionary *);
     int total_n_new;
+    bool tdrzEnable;
 };
 
-- (void)
+- (void)transcribeData:(int)jobId
     audioData:(float *)audioData
     audioDataCount:(int)audioDataCount
     options:(NSDictionary *)options
@@ -334,9 +373,9 @@ struct rnwhisper_segments_callback_data {
     dispatch_async(dQueue, ^{
         self->recordState.isStoppedByAction = false;
         self->recordState.isTranscribing = true;
-        self->recordState.jobId = jobId;
 
-        whisper_full_params params = [self
+        whisper_full_params params = [self createParams:options jobId:jobId];
+
         if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
             params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
                 void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -354,12 +393,18 @@ struct rnwhisper_segments_callback_data {
             NSMutableArray *segments = [[NSMutableArray alloc] init];
             for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
                 const char * text_cur = whisper_full_get_segment_text(ctx, i);
-
+                NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+                if (data->tdrzEnable && whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+                    [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+                }
+
+                text = [text stringByAppendingString:mutable_ns_text];
 
                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
                 NSDictionary *segment = @{
-                    @"text": [NSString
+                    @"text": [NSString stringWithString:mutable_ns_text],
                     @"t0": [NSNumber numberWithLongLong:t0],
                     @"t1": [NSNumber numberWithLongLong:t1]
                 };
@@ -377,12 +422,16 @@ struct rnwhisper_segments_callback_data {
             };
             struct rnwhisper_segments_callback_data user_data = {
                 .onNewSegments = onNewSegments,
-                .
+                .tdrzEnable = options[@"tdrzEnable"] && [options[@"tdrzEnable"] boolValue],
+                .total_n_new = 0,
             };
             params.new_segment_callback_user_data = &user_data;
         }
-
-
+
+        rnwhisper::job* job = rnwhisper::job_new(jobId, params);
+        self->recordState.job = job;
+        int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+        rnwhisper::job_remove(jobId);
         self->recordState.isTranscribing = false;
         onEnd(code);
     });
@@ -397,9 +446,13 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopTranscribe:(int)jobId {
-
+    if (self->recordState.job) self->recordState.job->abort();
     if (self->recordState.isRealtime && self->recordState.isCapturing) {
         [self stopAudio];
+        if (!self->recordState.isTranscribing) {
+            // Handle for VAD case
+            self->recordState.transcribeHandler(jobId, @"end", @{});
+        }
     }
     self->recordState.isCapturing = false;
     self->recordState.isStoppedByAction = true;
@@ -407,13 +460,11 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopCurrentTranscribe {
-    if (
-
-    }
-    [self stopTranscribe:self->recordState.jobId];
+    if (self->recordState.job == nullptr) return;
+    [self stopTranscribe:self->recordState.job->job_id];
 }
 
-- (struct whisper_full_params)
+- (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
     const int n_threads = options[@"maxThreads"] != nil ?
@@ -432,9 +483,8 @@ struct rnwhisper_segments_callback_data {
     params.print_progress = false;
     params.print_timestamps = false;
     params.print_special = false;
-    params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
     params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
-    params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+    params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
     params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
     params.offset_ms = 0;
     params.no_context = true;
@@ -444,6 +494,7 @@ struct rnwhisper_segments_callback_data {
         params.max_len = [options[@"maxLen"] intValue];
     }
     params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+    params.tdrz_enable = options[@"tdrzEnable"] != nil ? [options[@"tdrzEnable"] boolValue] : false;
 
     if (options[@"bestOf"] != nil) {
         params.greedy.best_of = [options[@"bestOf"] intValue];
@@ -451,7 +502,6 @@ struct rnwhisper_segments_callback_data {
     if (options[@"maxContext"] != nil) {
         params.n_max_text_ctx = [options[@"maxContext"] intValue];
     }
-
     if (options[@"offset"] != nil) {
         params.offset_ms = [options[@"offset"] intValue];
     }
@@ -467,35 +517,20 @@ struct rnwhisper_segments_callback_data {
     if (options[@"temperatureInc"] != nil) {
         params.temperature_inc = [options[@"temperature_inc"] floatValue];
     }
-
     if (options[@"prompt"] != nil) {
-        params.initial_prompt = [options[@"prompt"] UTF8String];
+        params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
     }
 
-    // abort handler
-    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return !is_aborted;
-    };
-    params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(jobId);
-    params.abort_callback = [](void * user_data) {
-        bool is_aborted = *(bool*)user_data;
-        return is_aborted;
-    };
-    params.abort_callback_user_data = rn_whisper_assign_abort_map(jobId);
-
     return params;
 }
 
-- (int)fullTranscribe:(
-    params:(struct whisper_full_params)params
+- (int)fullTranscribe:(rnwhisper::job *)job
     audioData:(float *)audioData
     audioDataCount:(int)audioDataCount
 {
     whisper_reset_timings(self->ctx);
-
-
-    rn_whisper_remove_abort_map(jobId);
+    int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+    if (job && job->is_aborted()) code = -999;
     // if (code == 0) {
     //     whisper_print_timings(self->ctx);
     // }
@@ -509,12 +544,21 @@ struct rnwhisper_segments_callback_data {
     NSMutableArray *segments = [[NSMutableArray alloc] init];
     for (int i = 0; i < n_segments; i++) {
         const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
-
+        NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+        // Simplified condition
+        if (self->recordState.options[@"tdrzEnable"] &&
+            [self->recordState.options[@"tdrzEnable"] boolValue] &&
+            whisper_full_get_segment_speaker_turn_next(self->ctx, i)) {
+            [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+        }
+
+        text = [text stringByAppendingString:mutable_ns_text];
 
         const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
         const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
         NSDictionary *segment = @{
-            @"text": [NSString
+            @"text": [NSString stringWithString:mutable_ns_text],
             @"t0": [NSNumber numberWithLongLong:t0],
             @"t1": [NSNumber numberWithLongLong:t1]
         };
@@ -526,10 +570,19 @@ struct rnwhisper_segments_callback_data {
     return result;
 }
 
+- (NSString *)bench:(int)maxThreads {
+    const int n_threads = maxThreads > 0 ? maxThreads : 0;
+
+    const int max_threads = (int) [[NSProcessInfo processInfo] processorCount];
+    // Use 2 threads by default on 4-core devices, 4 threads on more cores
+    const int default_n_threads = max_threads == 4 ? 2 : MIN(4, max_threads);
+    NSString *result = [NSString stringWithUTF8String:rnwhisper::bench(self->ctx, n_threads).c_str()];
+    return result;
+}
+
 - (void)invalidate {
     [self stopCurrentTranscribe];
     whisper_free(self->ctx);
-    [self freeBufferIfNeeded];
 }
 
 @end
```
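
Two behaviors in the diff above deserve a note. First, instead of trusting the build flag alone, the init path now probes Metal at runtime by compiling a one-line `simd_sum` kernel; on failure it records the error in `reasonNoMetal` and falls back to CPU. Second, `tdrzEnable` threads whisper.cpp's tinydiarize support through: when `whisper_full_get_segment_speaker_turn_next` reports a speaker change, the segment text gets a ` [SPEAKER_TURN]` suffix. A hedged sketch of consuming that marker from JS — `initWhisper`/`transcribe` follow the package README, and the tdrz model filename is purely illustrative:

```ts
import { initWhisper } from 'whisper.rn'

// Sketch: speaker-turn markers via tdrzEnable. Requires a tinydiarize-capable
// model; the filename below is illustrative, not shipped with the package.
async function transcribeWithSpeakerTurns(audioPath: string) {
  const whisperContext = await initWhisper({
    filePath: 'ggml-small.en-tdrz.bin',
  })

  const { promise } = whisperContext.transcribe(audioPath, {
    tdrzEnable: true, // maps to params.tdrz_enable in createParams above
  })

  const { result, segments } = await promise
  // Per the native code above, a segment that ends on a speaker change
  // carries the " [SPEAKER_TURN]" suffix in its text.
  const turns = segments.filter((s) => s.text.endsWith('[SPEAKER_TURN]'))
  console.log(result, `${turns.length} speaker turn(s)`)
}
```
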
package/jest/mock.js
CHANGED

```diff
@@ -2,12 +2,17 @@ const { NativeModules, DeviceEventEmitter } = require('react-native')
 
 if (!NativeModules.RNWhisper) {
   NativeModules.RNWhisper = {
-    initContext: jest.fn(() => Promise.resolve(1)),
+    initContext: jest.fn(() => Promise.resolve({ contextId: 1 })),
     transcribeFile: jest.fn(() => Promise.resolve({
       result: ' Test',
       segments: [{ text: ' Test', t0: 0, t1: 33 }],
       isAborted: false,
     })),
+    transcribeData: jest.fn(() => Promise.resolve({
+      result: ' Test',
+      segments: [{ text: ' Test', t0: 0, t1: 33 }],
+      isAborted: false,
+    })),
     startRealtimeTranscribe: jest.fn((contextId, jobId) => {
       setTimeout(() => {
         // Start
@@ -45,11 +50,19 @@ if (!NativeModules.RNWhisper) {
         })
       })
     }),
+    bench: jest.fn(() => Promise.resolve({
+      config: 'NEON',
+      nThreads: 1,
+      encodeMs: 1,
+      decodeMs: 1,
+      batchMs: 1,
+      promptMs: 1,
+    })),
     releaseContext: jest.fn(() => Promise.resolve()),
     releaseAllContexts: jest.fn(() => Promise.resolve()),
 
     // iOS AudioSession utils
-    getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
+    getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
       category: 'AVAudioSessionCategoryPlayAndRecord',
       options: [],
     })),
```
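
The mock tracks the new native surface: `initContext` now resolves an object rather than a bare id, and `transcribeData`/`bench` return canned results so tests never touch native code. Wiring it into a suite follows the usual pattern from the package README (the assertion below simply mirrors the canned values in the mock above):

```ts
// jest.mock is hoisted by babel-jest, so the native module is replaced
// before whisper.rn is imported.
jest.mock('whisper.rn', () => require('whisper.rn/jest/mock'))

import { initWhisper } from 'whisper.rn'

test('transcribe resolves the mocked segments', async () => {
  const context = await initWhisper({ filePath: 'ggml-base.bin' })
  const { promise } = context.transcribe('audio.wav')
  await expect(promise).resolves.toMatchObject({
    result: ' Test',
    segments: [{ text: ' Test', t0: 0, t1: 33 }],
    isAborted: false,
  })
})
```
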
package/lib/commonjs/NativeRNWhisper.js.map
CHANGED

```diff
@@ -1 +1 @@
-{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,
+{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0GnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
```
|