npm - whisper.rn - Versions diffs - 0.4.0-rc.1 → 0.4.0-rc.3 - Mend

whisper.rn 0.4.0-rc.1 → 0.4.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/android/src/main/java/com/rnwhisper/WhisperContext.java +26 -12
package/ios/RNWhisperContext.h +6 -0
package/ios/RNWhisperContext.mm +33 -20
package/lib/typescript/index.d.ts +1 -1
package/package.json +1 -1
package/src/index.ts +1 -1

package/android/src/main/java/com/rnwhisper/WhisperContext.java CHANGED Viewed

@@ -82,8 +82,9 @@ public class WhisperContext {
   private boolean vad(ReadableMap options, short[] shortBuffer, int nSamples, int n) {
     boolean isSpeech = true;
     if (!isTranscribing && options.hasKey("useVad") && options.getBoolean("useVad")) {
-      int vadSec = options.hasKey("vadMs") ? options.getInt("vadMs") / 1000 : 2;
-      int sampleSize = vadSec * SAMPLE_RATE;
+      int vadMs = options.hasKey("vadMs") ? options.getInt("vadMs") : 2000;
+      if (vadMs < 2000) vadMs = 2000;
+      int sampleSize = (int) (SAMPLE_RATE * vadMs / 1000);
       if (nSamples + n > sampleSize) {
         int start = nSamples + n - sampleSize;
         float[] audioData = new float[sampleSize];
@@ -100,6 +101,21 @@ public class WhisperContext {
     return isSpeech;
   }
+  private void finishRealtimeTranscribe(ReadableMap options, WritableMap result) {
+    String audioOutputPath = options.hasKey("audioOutputPath") ? options.getString("audioOutputPath") : null;
+    if (audioOutputPath != null) {
+       // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+      Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
+      try {
+        AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
+      } catch (IOException e) {
+        Log.e(NAME, "Error saving wav file: " + e.getMessage());
+      }
+    }
+    emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+  }
   public int startRealtimeTranscribe(int jobId, ReadableMap options) {
     if (isCapturing || isTranscribing) {
       return -100;
@@ -131,7 +147,7 @@ public class WhisperContext {
     shortBufferSlices.add(new short[audioSliceSec * SAMPLE_RATE]);
     sliceNSamples = new ArrayList<Integer>();
     sliceNSamples.add(0);
     isCapturing = true;
     recorder.startRecording();
@@ -159,12 +175,12 @@ public class WhisperContext {
                   nSamples == nSamplesTranscribing &&
                   sliceIndex == transcribeSliceIndex
                 ) {
-                  emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                  finishRealtimeTranscribe(options, Arguments.createMap());
                 } else if (!isTranscribing) {
                   short[] shortBuffer = shortBufferSlices.get(sliceIndex);
                   boolean isSpeech = vad(options, shortBuffer, nSamples, 0);
                   if (!isSpeech) {
-                    emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                    finishRealtimeTranscribe(options, Arguments.createMap());
                     break;
                   }
                   isTranscribing = true;
@@ -210,11 +226,9 @@ public class WhisperContext {
               Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
             }
           }
-          // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-          Log.d(NAME, "Begin saving wav file to " + audioOutputPath);
-          AudioUtils.saveWavFile(AudioUtils.concatShortBuffers(shortBufferSlices), audioOutputPath);
           if (!isTranscribing) {
-            emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+            finishRealtimeTranscribe(options, Arguments.createMap());
           }
           if (fullHandler != null) {
             fullHandler.join(); // Wait for full transcribe to finish
@@ -288,7 +302,7 @@ public class WhisperContext {
     if (isStopped && !continueNeeded) {
       payload.putBoolean("isCapturing", false);
       payload.putBoolean("isStoppedByAction", isStoppedByAction);
-      emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", payload);
+      finishRealtimeTranscribe(options, payload);
     } else if (code == 0) {
       payload.putBoolean("isCapturing", true);
       emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
@@ -401,7 +415,7 @@ public class WhisperContext {
       options.hasKey("maxLen") ? options.getInt("maxLen") : -1,
       // jboolean token_timestamps,
       options.hasKey("tokenTimestamps") ? options.getBoolean("tokenTimestamps") : false,
       // jint offset,
       options.hasKey("offset") ? options.getInt("offset") : -1,
       // jint duration,
@@ -577,4 +591,4 @@ public class WhisperContext {
   protected static native int getTextSegmentT0(long context, int index);
   protected static native int getTextSegmentT1(long context, int index);
   protected static native void freeContext(long contextPtr);
-}
+}

package/ios/RNWhisperContext.h CHANGED Viewed

@@ -27,6 +27,12 @@ typedef struct {
     int sliceIndex;
     int transcribeSliceIndex;
     int audioSliceSec;
+    NSString* audioOutputPath;
+    bool useVad;
+    int vadMs;
+    float vadThold;
+    float vadFreqThold;
     AudioQueueRef queue;
     AudioStreamBasicDescription dataFormat;

package/ios/RNWhisperContext.mm CHANGED Viewed

@@ -53,6 +53,15 @@
     int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
     int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
+    self->recordState.audioOutputPath = options[@"audioOutputPath"];
+    self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
+    self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
+    if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
+    self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
+    self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
     self->recordState.audioSliceSec = audioSliceSec;
     self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
@@ -90,18 +99,15 @@
 bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
 {
     bool isSpeech = true;
-    if (!state->isTranscribing && state->options[@"useVad"]) {
-        int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
-        int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
+    if (!state->isTranscribing && state->useVad) {
+        int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
         if (nSamples + n > sampleSize) {
             int start = nSamples + n - sampleSize;
             std::vector<float> audioBufferF32Vec(sampleSize);
             for (int i = 0; i < sampleSize; i++) {
                 audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
             }
-            float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
-            float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
-            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
+            isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
             NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
         } else {
             isSpeech = false;
@@ -122,7 +128,7 @@ void AudioInputCallback(void * inUserData,
     if (!state->isCapturing) {
         NSLog(@"[RNWhisper] Not capturing, ignoring audio");
         if (!state->isTranscribing) {
-            state->transcribeHandler(state->jobId, @"end", @{});
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
         }
         return;
     }
@@ -145,14 +151,14 @@ void AudioInputCallback(void * inUserData,
             nSamples == state->nSamplesTranscribing &&
             state->sliceIndex == state->transcribeSliceIndex
         ) {
-            state->transcribeHandler(state->jobId, @"end", @{});
+            [state->mSelf finishRealtimeTranscribe:state result:@{}];
         } else if (
             !state->isTranscribing &&
             nSamples != state->nSamplesTranscribing
         ) {
             int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
             if (!vad(state, audioBufferI16, nSamples, 0)) {
-                state->transcribeHandler(state->jobId, @"end", @{});
+                [state->mSelf finishRealtimeTranscribe:state result:@{}];
                 return;
             }
             state->isTranscribing = true;
@@ -197,6 +203,19 @@ void AudioInputCallback(void * inUserData,
     }
 }
+- (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
+    // Save wav if needed
+    if (state->audioOutputPath != nil) {
+        // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+        [RNWhisperAudioUtils
+            saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
+                            sliceNSamples:state->sliceNSamples]
+            audioOutputFile:state->audioOutputPath
+        ];
+    }
+    state->transcribeHandler(state->jobId, @"end", result);
+}
 - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
     int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
     state->nSamplesTranscribing = nSamplesOfIndex;
@@ -256,17 +275,7 @@ void AudioInputCallback(void * inUserData,
         result[@"isStoppedByAction"] = @(state->isStoppedByAction);
         result[@"isCapturing"] = @(false);
-        // Save wav if needed
-        if (state->options[@"audioOutputPath"] != nil) {
-            // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-            [RNWhisperAudioUtils
-                saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
-                                sliceNSamples:state->sliceNSamples]
-                audioOutputFile:state->options[@"audioOutputPath"]
-            ];
-        }
-        state->transcribeHandler(state->jobId, @"end", result);
+        [state->mSelf finishRealtimeTranscribe:state result:result];
     } else if (code == 0) {
         result[@"isCapturing"] = @(true);
         state->transcribeHandler(state->jobId, @"transcribe", result);
@@ -408,6 +417,10 @@ struct rnwhisper_segments_callback_data {
     rn_whisper_abort_transcribe(jobId);
     if (self->recordState.isRealtime && self->recordState.isCapturing) {
         [self stopAudio];
+        if (!self->recordState.isTranscribing) {
+            // Handle for VAD case
+            self->recordState.transcribeHandler(jobId, @"end", @{});
+        }
     }
     self->recordState.isCapturing = false;
     self->recordState.isStoppedByAction = true;

package/lib/typescript/index.d.ts CHANGED Viewed

@@ -59,7 +59,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
      */
     useVad?: boolean;
     /**
-     * The length of the collected audio is used for VAD. (ms) (Default: 2000)
+     * The length of the collected audio is used for VAD, cannot be less than 2000ms. (ms) (Default: 2000)
      */
     vadMs?: number;
     /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "whisper.rn",
-  "version": "0.4.0-rc.1",
+  "version": "0.4.0-rc.3",
   "description": "React Native binding of whisper.cpp",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",

package/src/index.ts CHANGED Viewed

@@ -106,7 +106,7 @@ export type TranscribeRealtimeOptions = TranscribeOptions & {
    */
   useVad?: boolean
   /**
-   * The length of the collected audio is used for VAD. (ms) (Default: 2000)
+   * The length of the collected audio is used for VAD, cannot be less than 2000ms. (ms) (Default: 2000)
    */
   vadMs?: number
   /**