whisper.rn 0.4.0-rc.1 → 0.4.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -82,8 +82,9 @@ public class WhisperContext {
82
82
  private boolean vad(ReadableMap options, short[] shortBuffer, int nSamples, int n) {
83
83
  boolean isSpeech = true;
84
84
  if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) {
85
- int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2;
86
- int sampleSize = vadSec * SAMPLE_RATE;
85
+ int vadMs = options.hasKey("vadMs") ? options.getInt("vadMs") : 2000;
86
+ if (vadMs < 2000) vadMs = 2000;
87
+ int sampleSize = (int) (SAMPLE_RATE * vadMs / 1000);
87
88
  if (nSamples + n > sampleSize) {
88
89
  int start = nSamples + n - sampleSize;
89
90
  float[] audioData = new float[sampleSize];
@@ -100,6 +101,21 @@ public class WhisperContext {
100
101
  return isSpeech;
101
102
  }
102
103
 
104
+ private void finishRealtimeTranscribe(ReadableMap options, WritableMap result) {
105
+ String audioOutputPath = options.hasKey("audioOutputPath") ? options.getString("audioOutputPath") : null;
106
+ if (audioOutputPath != null) {
107
+ // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
108
+ Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
109
+ try {
110
+ AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
111
+ } catch (IOException e) {
112
+ Log.e(NAME, "Error saving wav file: " + e.getMessage());
113
+ }
114
+ }
115
+
116
+ emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
117
+ }
118
+
103
119
  public int startRealtimeTranscribe(int jobId, ReadableMap options) {
104
120
  if (isCapturing || isTranscribing) {
105
121
  return -100;
@@ -131,7 +147,7 @@ public class WhisperContext {
131
147
  shortBufferSlices.add(new short[audioSliceSec * SAMPLE_RATE]);
132
148
  sliceNSamples = new ArrayList<Integer>();
133
149
  sliceNSamples.add(0);
134
-
150
+
135
151
  isCapturing = true;
136
152
  recorder.startRecording();
137
153
 
@@ -159,12 +175,12 @@ public class WhisperContext {
159
175
  nSamples == nSamplesTranscribing &&
160
176
  sliceIndex == transcribeSliceIndex
161
177
  ) {
162
- emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
178
+ finishRealtimeTranscribe(options, Arguments.createMap());
163
179
  } else if (!isTranscribing) {
164
180
  short[] shortBuffer = shortBufferSlices.get(sliceIndex);
165
181
  boolean isSpeech = vad(options, shortBuffer, nSamples, 0);
166
182
  if (!isSpeech) {
167
- emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
183
+ finishRealtimeTranscribe(options, Arguments.createMap());
168
184
  break;
169
185
  }
170
186
  isTranscribing = true;
@@ -210,11 +226,9 @@ public class WhisperContext {
210
226
  Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
211
227
  }
212
228
  }
213
- // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
214
- Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
215
- AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
229
+
216
230
  if (!isTranscribing) {
217
- emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
231
+ finishRealtimeTranscribe(options, Arguments.createMap());
218
232
  }
219
233
  if (fullHandler != null) {
220
234
  fullHandler.join(); // Wait for full transcribe to finish
@@ -288,7 +302,7 @@ public class WhisperContext {
288
302
  if (isStopped && !continueNeeded) {
289
303
  payload.putBoolean("isCapturing", false);
290
304
  payload.putBoolean("isStoppedByAction", isStoppedByAction);
291
- emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", payload);
305
+ finishRealtimeTranscribe(options, payload);
292
306
  } else if (code == 0) {
293
307
  payload.putBoolean("isCapturing", true);
294
308
  emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
@@ -401,7 +415,7 @@ public class WhisperContext {
401
415
  options.hasKey("maxLen") ? options.getInt("maxLen") : -1,
402
416
  // jboolean token_timestamps,
403
417
  options.hasKey("tokenTimestamps") ? options.getBoolean("tokenTimestamps") : false,
404
-
418
+
405
419
  // jint offset,
406
420
  options.hasKey("offset") ? options.getInt("offset") : -1,
407
421
  // jint duration,
@@ -577,4 +591,4 @@ public class WhisperContext {
577
591
  protected static native int getTextSegmentT0(long context, int index);
578
592
  protected static native int getTextSegmentT1(long context, int index);
579
593
  protected static native void freeContext(long contextPtr);
580
- }
594
+ }
@@ -27,6 +27,12 @@ typedef struct {
27
27
  int sliceIndex;
28
28
  int transcribeSliceIndex;
29
29
  int audioSliceSec;
30
+ NSString* audioOutputPath;
31
+
32
+ bool useVad;
33
+ int vadMs;
34
+ float vadThold;
35
+ float vadFreqThold;
30
36
 
31
37
  AudioQueueRef queue;
32
38
  AudioStreamBasicDescription dataFormat;
@@ -53,6 +53,15 @@
53
53
  int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
54
54
  int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
55
55
 
56
+ self->recordState.audioOutputPath = options[@"audioOutputPath"];
57
+
58
+ self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
59
+ self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
60
+ if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
61
+
62
+ self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
63
+ self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
64
+
56
65
  self->recordState.audioSliceSec = audioSliceSec;
57
66
  self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
58
67
 
@@ -90,18 +99,15 @@
90
99
  bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
91
100
  {
92
101
  bool isSpeech = true;
93
- if (!state->isTranscribing && state->options[@"useVad"]) {
94
- int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
95
- int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
102
+ if (!state->isTranscribing && state->useVad) {
103
+ int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
96
104
  if (nSamples + n > sampleSize) {
97
105
  int start = nSamples + n - sampleSize;
98
106
  std::vector<float> audioBufferF32Vec(sampleSize);
99
107
  for (int i = 0; i < sampleSize; i++) {
100
108
  audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
101
109
  }
102
- float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
103
- float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
104
- isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
110
+ isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
105
111
  NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
106
112
  } else {
107
113
  isSpeech = false;
@@ -122,7 +128,7 @@ void AudioInputCallback(void * inUserData,
122
128
  if (!state->isCapturing) {
123
129
  NSLog(@"[RNWhisper] Not capturing, ignoring audio");
124
130
  if (!state->isTranscribing) {
125
- state->transcribeHandler(state->jobId, @"end", @{});
131
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
126
132
  }
127
133
  return;
128
134
  }
@@ -145,14 +151,14 @@ void AudioInputCallback(void * inUserData,
145
151
  nSamples == state->nSamplesTranscribing &&
146
152
  state->sliceIndex == state->transcribeSliceIndex
147
153
  ) {
148
- state->transcribeHandler(state->jobId, @"end", @{});
154
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
149
155
  } else if (
150
156
  !state->isTranscribing &&
151
157
  nSamples != state->nSamplesTranscribing
152
158
  ) {
153
159
  int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
154
160
  if (!vad(state, audioBufferI16, nSamples, 0)) {
155
- state->transcribeHandler(state->jobId, @"end", @{});
161
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
156
162
  return;
157
163
  }
158
164
  state->isTranscribing = true;
@@ -197,6 +203,19 @@ void AudioInputCallback(void * inUserData,
197
203
  }
198
204
  }
199
205
 
206
+ - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
207
+ // Save wav if needed
208
+ if (state->audioOutputPath != nil) {
209
+ // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
210
+ [RNWhisperAudioUtils
211
+ saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
212
+ sliceNSamples:state->sliceNSamples]
213
+ audioOutputFile:state->audioOutputPath
214
+ ];
215
+ }
216
+ state->transcribeHandler(state->jobId, @"end", result);
217
+ }
218
+
200
219
  - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
201
220
  int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
202
221
  state->nSamplesTranscribing = nSamplesOfIndex;
@@ -256,17 +275,7 @@ void AudioInputCallback(void * inUserData,
256
275
  result[@"isStoppedByAction"] = @(state->isStoppedByAction);
257
276
  result[@"isCapturing"] = @(false);
258
277
 
259
- // Save wav if needed
260
- if (state->options[@"audioOutputPath"] != nil) {
261
- // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
262
- [RNWhisperAudioUtils
263
- saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
264
- sliceNSamples:state->sliceNSamples]
265
- audioOutputFile:state->options[@"audioOutputPath"]
266
- ];
267
- }
268
-
269
- state->transcribeHandler(state->jobId, @"end", result);
278
+ [state->mSelf finishRealtimeTranscribe:state result:result];
270
279
  } else if (code == 0) {
271
280
  result[@"isCapturing"] = @(true);
272
281
  state->transcribeHandler(state->jobId, @"transcribe", result);
@@ -408,6 +417,10 @@ struct rnwhisper_segments_callback_data {
408
417
  rn_whisper_abort_transcribe(jobId);
409
418
  if (self->recordState.isRealtime && self->recordState.isCapturing) {
410
419
  [self stopAudio];
420
+ if (!self->recordState.isTranscribing) {
421
+ // Handle for VAD case
422
+ self->recordState.transcribeHandler(jobId, @"end", @{});
423
+ }
411
424
  }
412
425
  self->recordState.isCapturing = false;
413
426
  self->recordState.isStoppedByAction = true;
@@ -59,7 +59,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
59
59
  */
60
60
  useVad?: boolean;
61
61
  /**
62
- * The length of the collected audio is used for VAD. (ms) (Default: 2000)
62
+ * The length (in ms) of the collected audio used for VAD. Cannot be less than 2000 ms. (Default: 2000)
63
63
  */
64
64
  vadMs?: number;
65
65
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "whisper.rn",
3
- "version": "0.4.0-rc.1",
3
+ "version": "0.4.0-rc.3",
4
4
  "description": "React Native binding of whisper.cpp",
5
5
  "main": "lib/commonjs/index",
6
6
  "module": "lib/module/index",
package/src/index.ts CHANGED
@@ -106,7 +106,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
106
106
  */
107
107
  useVad?: boolean
108
108
  /**
109
- * The length of the collected audio is used for VAD. (ms) (Default: 2000)
109
+ * The length (in ms) of the collected audio used for VAD. Cannot be less than 2000 ms. (Default: 2000)
110
110
  */
111
111
  vadMs?: number
112
112
  /**