whisper.rn 0.4.0-rc.1 → 0.4.0-rc.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -82,8 +82,9 @@ public class WhisperContext {
|
|
|
82
82
|
private boolean vad(ReadableMap options, short[] shortBuffer, int nSamples, int n) {
|
|
83
83
|
boolean isSpeech = true;
|
|
84
84
|
if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) {
|
|
85
|
-
int
|
|
86
|
-
|
|
85
|
+
int vadMs = options.hasKey("vadMs") ? options.getInt("vadMs") : 2000;
|
|
86
|
+
if (vadMs < 2000) vadMs = 2000;
|
|
87
|
+
int sampleSize = (int) (SAMPLE_RATE * vadMs / 1000);
|
|
87
88
|
if (nSamples + n > sampleSize) {
|
|
88
89
|
int start = nSamples + n - sampleSize;
|
|
89
90
|
float[] audioData = new float[sampleSize];
|
|
@@ -100,6 +101,21 @@ public class WhisperContext {
|
|
|
100
101
|
return isSpeech;
|
|
101
102
|
}
|
|
102
103
|
|
|
104
|
+
private void finishRealtimeTranscribe(ReadableMap options, WritableMap result) {
|
|
105
|
+
String audioOutputPath = options.hasKey("audioOutputPath") ? options.getString("audioOutputPath") : null;
|
|
106
|
+
if (audioOutputPath != null) {
|
|
107
|
+
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
|
|
108
|
+
Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
|
|
109
|
+
try {
|
|
110
|
+
AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
|
|
111
|
+
} catch (IOException e) {
|
|
112
|
+
Log.e(NAME, "Error saving wav file: " + e.getMessage());
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
|
|
117
|
+
}
|
|
118
|
+
|
|
103
119
|
public int startRealtimeTranscribe(int jobId, ReadableMap options) {
|
|
104
120
|
if (isCapturing || isTranscribing) {
|
|
105
121
|
return -100;
|
|
@@ -131,7 +147,7 @@ public class WhisperContext {
|
|
|
131
147
|
shortBufferSlices.add(new short[audioSliceSec * SAMPLE_RATE]);
|
|
132
148
|
sliceNSamples = new ArrayList<Integer>();
|
|
133
149
|
sliceNSamples.add(0);
|
|
134
|
-
|
|
150
|
+
|
|
135
151
|
isCapturing = true;
|
|
136
152
|
recorder.startRecording();
|
|
137
153
|
|
|
@@ -159,12 +175,12 @@ public class WhisperContext {
|
|
|
159
175
|
nSamples == nSamplesTranscribing &&
|
|
160
176
|
sliceIndex == transcribeSliceIndex
|
|
161
177
|
) {
|
|
162
|
-
|
|
178
|
+
finishRealtimeTranscribe(options, Arguments.createMap());
|
|
163
179
|
} else if (!isTranscribing) {
|
|
164
180
|
short[] shortBuffer = shortBufferSlices.get(sliceIndex);
|
|
165
181
|
boolean isSpeech = vad(options, shortBuffer, nSamples, 0);
|
|
166
182
|
if (!isSpeech) {
|
|
167
|
-
|
|
183
|
+
finishRealtimeTranscribe(options, Arguments.createMap());
|
|
168
184
|
break;
|
|
169
185
|
}
|
|
170
186
|
isTranscribing = true;
|
|
@@ -210,11 +226,9 @@ public class WhisperContext {
|
|
|
210
226
|
Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
|
|
211
227
|
}
|
|
212
228
|
}
|
|
213
|
-
|
|
214
|
-
Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
|
|
215
|
-
AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
|
|
229
|
+
|
|
216
230
|
if (!isTranscribing) {
|
|
217
|
-
|
|
231
|
+
finishRealtimeTranscribe(options, Arguments.createMap());
|
|
218
232
|
}
|
|
219
233
|
if (fullHandler != null) {
|
|
220
234
|
fullHandler.join(); // Wait for full transcribe to finish
|
|
@@ -288,7 +302,7 @@ public class WhisperContext {
|
|
|
288
302
|
if (isStopped && !continueNeeded) {
|
|
289
303
|
payload.putBoolean("isCapturing", false);
|
|
290
304
|
payload.putBoolean("isStoppedByAction", isStoppedByAction);
|
|
291
|
-
|
|
305
|
+
finishRealtimeTranscribe(options, payload);
|
|
292
306
|
} else if (code == 0) {
|
|
293
307
|
payload.putBoolean("isCapturing", true);
|
|
294
308
|
emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
|
|
@@ -401,7 +415,7 @@ public class WhisperContext {
|
|
|
401
415
|
options.hasKey("maxLen") ? options.getInt("maxLen") : -1,
|
|
402
416
|
// jboolean token_timestamps,
|
|
403
417
|
options.hasKey("tokenTimestamps") ? options.getBoolean("tokenTimestamps") : false,
|
|
404
|
-
|
|
418
|
+
|
|
405
419
|
// jint offset,
|
|
406
420
|
options.hasKey("offset") ? options.getInt("offset") : -1,
|
|
407
421
|
// jint duration,
|
|
@@ -577,4 +591,4 @@ public class WhisperContext {
|
|
|
577
591
|
protected static native int getTextSegmentT0(long context, int index);
|
|
578
592
|
protected static native int getTextSegmentT1(long context, int index);
|
|
579
593
|
protected static native void freeContext(long contextPtr);
|
|
580
|
-
}
|
|
594
|
+
}
|
package/ios/RNWhisperContext.h
CHANGED
|
@@ -27,6 +27,12 @@ typedef struct {
|
|
|
27
27
|
int sliceIndex;
|
|
28
28
|
int transcribeSliceIndex;
|
|
29
29
|
int audioSliceSec;
|
|
30
|
+
NSString* audioOutputPath;
|
|
31
|
+
|
|
32
|
+
bool useVad;
|
|
33
|
+
int vadMs;
|
|
34
|
+
float vadThold;
|
|
35
|
+
float vadFreqThold;
|
|
30
36
|
|
|
31
37
|
AudioQueueRef queue;
|
|
32
38
|
AudioStreamBasicDescription dataFormat;
|
package/ios/RNWhisperContext.mm
CHANGED
|
@@ -53,6 +53,15 @@
|
|
|
53
53
|
int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
|
|
54
54
|
int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
|
|
55
55
|
|
|
56
|
+
self->recordState.audioOutputPath = options[@"audioOutputPath"];
|
|
57
|
+
|
|
58
|
+
self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
|
|
59
|
+
self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
|
|
60
|
+
if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
|
|
61
|
+
|
|
62
|
+
self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
|
|
63
|
+
self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
|
|
64
|
+
|
|
56
65
|
self->recordState.audioSliceSec = audioSliceSec;
|
|
57
66
|
self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
|
|
58
67
|
|
|
@@ -90,18 +99,15 @@
|
|
|
90
99
|
bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
|
|
91
100
|
{
|
|
92
101
|
bool isSpeech = true;
|
|
93
|
-
if (!state->isTranscribing && state->
|
|
94
|
-
int
|
|
95
|
-
int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
|
|
102
|
+
if (!state->isTranscribing && state->useVad) {
|
|
103
|
+
int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
|
|
96
104
|
if (nSamples + n > sampleSize) {
|
|
97
105
|
int start = nSamples + n - sampleSize;
|
|
98
106
|
std::vector<float> audioBufferF32Vec(sampleSize);
|
|
99
107
|
for (int i = 0; i < sampleSize; i++) {
|
|
100
108
|
audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
|
|
101
109
|
}
|
|
102
|
-
|
|
103
|
-
float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
|
|
104
|
-
isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
|
|
110
|
+
isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
|
|
105
111
|
NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
|
|
106
112
|
} else {
|
|
107
113
|
isSpeech = false;
|
|
@@ -122,7 +128,7 @@ void AudioInputCallback(void * inUserData,
|
|
|
122
128
|
if (!state->isCapturing) {
|
|
123
129
|
NSLog(@"[RNWhisper] Not capturing, ignoring audio");
|
|
124
130
|
if (!state->isTranscribing) {
|
|
125
|
-
state->
|
|
131
|
+
[state->mSelf finishRealtimeTranscribe:state result:@{}];
|
|
126
132
|
}
|
|
127
133
|
return;
|
|
128
134
|
}
|
|
@@ -145,14 +151,14 @@ void AudioInputCallback(void * inUserData,
|
|
|
145
151
|
nSamples == state->nSamplesTranscribing &&
|
|
146
152
|
state->sliceIndex == state->transcribeSliceIndex
|
|
147
153
|
) {
|
|
148
|
-
state->
|
|
154
|
+
[state->mSelf finishRealtimeTranscribe:state result:@{}];
|
|
149
155
|
} else if (
|
|
150
156
|
!state->isTranscribing &&
|
|
151
157
|
nSamples != state->nSamplesTranscribing
|
|
152
158
|
) {
|
|
153
159
|
int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
|
|
154
160
|
if (!vad(state, audioBufferI16, nSamples, 0)) {
|
|
155
|
-
state->
|
|
161
|
+
[state->mSelf finishRealtimeTranscribe:state result:@{}];
|
|
156
162
|
return;
|
|
157
163
|
}
|
|
158
164
|
state->isTranscribing = true;
|
|
@@ -197,6 +203,19 @@ void AudioInputCallback(void * inUserData,
|
|
|
197
203
|
}
|
|
198
204
|
}
|
|
199
205
|
|
|
206
|
+
- (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
|
|
207
|
+
// Save wav if needed
|
|
208
|
+
if (state->audioOutputPath != nil) {
|
|
209
|
+
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
|
|
210
|
+
[RNWhisperAudioUtils
|
|
211
|
+
saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
|
|
212
|
+
sliceNSamples:state->sliceNSamples]
|
|
213
|
+
audioOutputFile:state->audioOutputPath
|
|
214
|
+
];
|
|
215
|
+
}
|
|
216
|
+
state->transcribeHandler(state->jobId, @"end", result);
|
|
217
|
+
}
|
|
218
|
+
|
|
200
219
|
- (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
|
|
201
220
|
int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
|
|
202
221
|
state->nSamplesTranscribing = nSamplesOfIndex;
|
|
@@ -256,17 +275,7 @@ void AudioInputCallback(void * inUserData,
|
|
|
256
275
|
result[@"isStoppedByAction"] = @(state->isStoppedByAction);
|
|
257
276
|
result[@"isCapturing"] = @(false);
|
|
258
277
|
|
|
259
|
-
|
|
260
|
-
if (state->options[@"audioOutputPath"] != nil) {
|
|
261
|
-
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
|
|
262
|
-
[RNWhisperAudioUtils
|
|
263
|
-
saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
|
|
264
|
-
sliceNSamples:state->sliceNSamples]
|
|
265
|
-
audioOutputFile:state->options[@"audioOutputPath"]
|
|
266
|
-
];
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
state->transcribeHandler(state->jobId, @"end", result);
|
|
278
|
+
[state->mSelf finishRealtimeTranscribe:state result:result];
|
|
270
279
|
} else if (code == 0) {
|
|
271
280
|
result[@"isCapturing"] = @(true);
|
|
272
281
|
state->transcribeHandler(state->jobId, @"transcribe", result);
|
|
@@ -408,6 +417,10 @@ struct rnwhisper_segments_callback_data {
|
|
|
408
417
|
rn_whisper_abort_transcribe(jobId);
|
|
409
418
|
if (self->recordState.isRealtime && self->recordState.isCapturing) {
|
|
410
419
|
[self stopAudio];
|
|
420
|
+
if (!self->recordState.isTranscribing) {
|
|
421
|
+
// Handle for VAD case
|
|
422
|
+
self->recordState.transcribeHandler(jobId, @"end", @{});
|
|
423
|
+
}
|
|
411
424
|
}
|
|
412
425
|
self->recordState.isCapturing = false;
|
|
413
426
|
self->recordState.isStoppedByAction = true;
|
|
@@ -59,7 +59,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
|
|
|
59
59
|
*/
|
|
60
60
|
useVad?: boolean;
|
|
61
61
|
/**
|
|
62
|
-
* The length of the collected audio is used for VAD. (ms) (Default: 2000)
|
|
62
|
+
* The length of the collected audio is used for VAD, cannot be less than 2000ms. (ms) (Default: 2000)
|
|
63
63
|
*/
|
|
64
64
|
vadMs?: number;
|
|
65
65
|
/**
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -106,7 +106,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
|
|
|
106
106
|
*/
|
|
107
107
|
useVad?: boolean
|
|
108
108
|
/**
|
|
109
|
-
* The length of the collected audio is used for VAD. (ms) (Default: 2000)
|
|
109
|
+
* The length of the collected audio is used for VAD, cannot be less than 2000ms. (ms) (Default: 2000)
|
|
110
110
|
*/
|
|
111
111
|
vadMs?: number
|
|
112
112
|
/**
|