whisper.rn 0.1.5 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -4
- package/android/build.gradle +2 -4
- package/android/src/main/java/com/rnwhisper/RNWhisperModule.java +47 -7
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +224 -7
- package/android/src/main/jni/whisper/Whisper.mk +1 -1
- package/android/src/main/jni/whisper/jni.cpp +34 -5
- package/cpp/rn-whisper.cpp +26 -0
- package/cpp/rn-whisper.h +5 -0
- package/ios/RNWhisper.h +2 -2
- package/ios/RNWhisper.mm +78 -111
- package/ios/RNWhisperContext.h +55 -0
- package/ios/RNWhisperContext.mm +326 -0
- package/jest/mock.js +43 -2
- package/lib/commonjs/index.js +59 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +60 -3
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/index.d.ts +63 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.tsx +124 -4
package/README.md
CHANGED
@@ -20,6 +20,23 @@ npm install whisper.rn
 
 Then re-run `npx pod-install` again for iOS.
 
+## Add Microphone Permissions (Optional)
+
+If you want to use realtime transcribe, you need to add the microphone permission to your app.
+
+### iOS
+Add these lines to ```ios/[YOUR_APP_NAME]/Info.plist```
+```xml
+<key>NSMicrophoneUsageDescription</key>
+<string>This app requires microphone access in order to transcribe speech</string>
+```
+
+### Android
+Add the following line to ```android/app/src/main/AndroidManifest.xml```
+```xml
+<uses-permission android:name="android.permission.RECORD_AUDIO" />
+```
+
 ## Usage
 
 ```js
@@ -30,13 +47,35 @@ const sampleFilePath = 'file://.../sample.wav'
 
 const whisperContext = await initWhisper({ filePath })
 
-const
-
-
-}
+const options = { language: 'en' }
+const { stop, promise } = whisperContext.transcribe(sampleFilePath, options)
+
+const { result } = await promise
 // result: (The inference text result from audio file)
 ```
 
+Use realtime transcribe:
+
+```js
+const { stop, subscribe } = await whisperContext.transcribeRealtime(options)
+
+subscribe(evt => {
+  const { isCapturing, data, processTime, recordingTime } = evt
+  console.log(
+    `Realtime transcribing: ${isCapturing ? 'ON' : 'OFF'}\n` +
+      // The inference text result from audio record:
+      `Result: ${data.result}\n\n` +
+      `Process time: ${processTime}ms\n` +
+      `Recording time: ${recordingTime}ms`,
+  )
+  if (!isCapturing) console.log('Finished realtime transcribing')
+})
+```
+
+In Android, you may need to request the microphone permission via [`PermissionsAndroid`](https://reactnative.dev/docs/permissionsandroid).
+
+The documentation is not ready yet; please see the comments in the [index](./src/index.tsx) file for more details at the moment.
+
 ## Run with example
 
 The example app is using [react-native-fs](https://github.com/itinance/react-native-fs) to download the model file and audio file.
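
The README points at React Native's permissions API for realtime use but stops short of showing it. A minimal sketch (not part of this diff; the rationale strings are illustrative) of requesting the runtime microphone permission with `PermissionsAndroid`:

```ts
import { PermissionsAndroid } from 'react-native'

// Ask for RECORD_AUDIO at runtime before starting transcribeRealtime on Android.
async function requestMicPermission(): Promise<boolean> {
  const granted = await PermissionsAndroid.request(
    PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
    {
      title: 'Microphone Permission',
      message: 'This app needs microphone access to transcribe speech.',
      buttonPositive: 'OK',
    },
  )
  return granted === PermissionsAndroid.RESULTS.GRANTED
}
```
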
package/android/build.gradle
CHANGED
@@ -40,10 +40,8 @@ android {
     buildConfigField "boolean", "IS_NEW_ARCHITECTURE_ENABLED", isNewArchitectureEnabled().toString()
   }
   externalNativeBuild {
-
-
-      path 'src/main/jni/whisper/Android.mk'
-    }
+    ndkBuild {
+      path 'src/main/jni/whisper/Android.mk'
     }
   }
   buildTypes {

package/android/src/main/java/com/rnwhisper/RNWhisperModule.java
CHANGED

@@ -5,6 +5,7 @@ import android.util.Log;
 import android.os.Build;
 import android.os.Handler;
 import android.os.AsyncTask;
+import android.media.AudioRecord;
 
 import com.facebook.react.bridge.Promise;
 import com.facebook.react.bridge.ReactApplicationContext;
@@ -51,7 +52,7 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
       throw new Exception("Failed to initialize context");
     }
     int id = Math.abs(new Random().nextInt());
-    WhisperContext whisperContext = new WhisperContext(context);
+    WhisperContext whisperContext = new WhisperContext(id, reactContext, context);
     contexts.put(id, whisperContext);
     return id;
   } catch (Exception e) {
@@ -72,18 +73,27 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
   }
 
   @ReactMethod
-  public void
+  public void transcribeFile(int id, int jobId, String filePath, ReadableMap options, Promise promise) {
+    final WhisperContext context = contexts.get(id);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    if (context.isCapturing()) {
+      promise.reject("The context is in realtime transcribe mode");
+      return;
+    }
+    if (context.isTranscribing()) {
+      promise.reject("Context is already transcribing");
+      return;
+    }
     new AsyncTask<Void, Void, WritableMap>() {
       private Exception exception;
 
       @Override
       protected WritableMap doInBackground(Void... voids) {
         try {
-
-          if (context == null) {
-            throw new Exception("Context " + id + " not found");
-          }
-          return context.transcribe(filePath, options);
+          return context.transcribeFile(jobId, filePath, options);
         } catch (Exception e) {
           exception = e;
           return null;
@@ -101,6 +111,35 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
     }.execute();
   }
 
+  @ReactMethod
+  public void startRealtimeTranscribe(int id, int jobId, ReadableMap options, Promise promise) {
+    final WhisperContext context = contexts.get(id);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    if (context.isCapturing()) {
+      promise.reject("Context is already in capturing");
+      return;
+    }
+    int state = context.startRealtimeTranscribe(jobId, options);
+    if (state == AudioRecord.STATE_INITIALIZED) {
+      promise.resolve(null);
+      return;
+    }
+    promise.reject("Failed to start realtime transcribe. State: " + state);
+  }
+
+  @ReactMethod
+  public void abortTranscribe(int contextId, int jobId, Promise promise) {
+    WhisperContext context = contexts.get(contextId);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    context.stopTranscribe(jobId);
+  }
+
   @ReactMethod
   public void releaseContext(int id, Promise promise) {
     new AsyncTask<Void, Void, Void>() {
@@ -168,6 +207,7 @@ public class RNWhisperModule extends ReactContextBaseJavaModule implements Lifec
 
   @Override
   public void onHostDestroy() {
+    WhisperContext.abortAllTranscribe();
     for (WhisperContext context : contexts.values()) {
       context.release();
     }
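
The new `@ReactMethod`s replace the single transcribe entry point with three: `transcribeFile`, `startRealtimeTranscribe`, and `abortTranscribe`. A hypothetical sketch of driving them through `NativeModules`; the method names and argument order come from the declarations above, but the wrapper itself is illustrative, not the package's actual `src/index.tsx`:

```ts
import { NativeModules } from 'react-native'

const { RNWhisper } = NativeModules

async function demo(contextId: number) {
  const jobId = Math.floor(Math.random() * 10000) // job ids are caller-chosen ints

  // One-shot file transcription: resolves with the text segments.
  const data = await RNWhisper.transcribeFile(
    contextId, jobId, '/path/to/sample.wav', { language: 'en' },
  )
  console.log(data.result)

  // Realtime capture: the promise resolves once AudioRecord reaches
  // STATE_INITIALIZED; results then arrive as device events instead.
  await RNWhisper.startRealtimeTranscribe(contextId, jobId, { realtimeAudioSec: 30 })

  // Abort either kind of job by id.
  RNWhisper.abortTranscribe(contextId, jobId)
}
```
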

package/android/src/main/java/com/rnwhisper/WhisperContext.java
CHANGED

@@ -4,10 +4,15 @@ import com.facebook.react.bridge.Arguments;
 import com.facebook.react.bridge.WritableArray;
 import com.facebook.react.bridge.WritableMap;
 import com.facebook.react.bridge.ReadableMap;
+import com.facebook.react.bridge.ReactApplicationContext;
+import com.facebook.react.modules.core.DeviceEventManagerModule;
 
 import android.util.Log;
 import android.os.Build;
 import android.content.res.AssetManager;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder.AudioSource;
 
 import java.util.Random;
 import java.lang.StringBuilder;
@@ -26,16 +31,202 @@ import java.nio.ShortBuffer;
 
 public class WhisperContext {
   public static final String NAME = "RNWhisperContext";
+
+  private static final int SAMPLE_RATE = 16000;
+  private static final int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
+  private static final int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
+  private static final int AUDIO_SOURCE = AudioSource.VOICE_RECOGNITION;
+  private static final int DEFAULT_MAX_AUDIO_SEC = 30;
+
+  private int id;
+  private ReactApplicationContext reactContext;
   private long context;
 
-
+  private DeviceEventManagerModule.RCTDeviceEventEmitter eventEmitter;
+
+  private int jobId = -1;
+  private AudioRecord recorder = null;
+  private int bufferSize;
+  private short[] buffer16;
+  private int nSamples = 0;
+  private int nSamplesTranscribing = 0;
+  private boolean isCapturing = false;
+  private boolean isStoppedByAction = false;
+  private boolean isTranscribing = false;
+  private boolean isRealtime = false;
+  private Thread fullHandler = null;
+
+  public WhisperContext(int id, ReactApplicationContext reactContext, long context) {
+    this.id = id;
     this.context = context;
+    this.reactContext = reactContext;
+    eventEmitter = reactContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class);
+    bufferSize = AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT);
   }
 
-  public
-
+  public int startRealtimeTranscribe(int jobId, ReadableMap options) {
+    if (isCapturing || isTranscribing) {
+      return -100;
+    }
+
+    recorder = new AudioRecord(AUDIO_SOURCE, SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT, bufferSize);
+
+    int state = recorder.getState();
+    if (state != AudioRecord.STATE_INITIALIZED) {
+      recorder.release();
+      return state;
+    }
+
+    int realtimeAudioSec = options.hasKey("realtimeAudioSec") ? options.getInt("realtimeAudioSec") : 0;
+    final int maxAudioSec = realtimeAudioSec > 0 ? realtimeAudioSec : DEFAULT_MAX_AUDIO_SEC;
+
+    buffer16 = new short[maxAudioSec * SAMPLE_RATE * Short.BYTES];
+
+    this.jobId = jobId;
+    isCapturing = true;
+    isStoppedByAction = false;
+    isRealtime = true;
+    nSamples = 0;
+    nSamplesTranscribing = 0;
+    fullHandler = null;
+
+    recorder.startRecording();
+
+    new Thread(new Runnable() {
+      @Override
+      public void run() {
+        try {
+          short[] buffer = new short[bufferSize];
+          while (isCapturing) {
+            try {
+              int n = recorder.read(buffer, 0, bufferSize);
+              if (n == 0) continue;
+
+              if (nSamples + n > maxAudioSec * SAMPLE_RATE) {
+                // Full, stop capturing
+                isCapturing = false;
+                if (!isTranscribing && nSamples == nSamplesTranscribing) {
+                  emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+                } else {
+                  // wait previous handler to finish
+                  fullHandler.join();
+                  fullTranscribeSamples(options, true);
+                }
+                break;
+              }
+
+              // Append to buffer
+              nSamples += n;
+              for (int i = 0; i < n; i++) {
+                buffer16[nSamples + i] = buffer[i];
+              }
+
+              fullTranscribeSamples(options, false);
+            } catch (Exception e) {
+              Log.e(NAME, "Error transcribing realtime: " + e.getMessage());
+            }
+          }
+          if (!isTranscribing) {
+            emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", Arguments.createMap());
+          }
+          if (fullHandler != null) {
+            fullHandler.join(); // Wait for full transcribe to finish
+          }
+          recorder.stop();
+        } catch (Exception e) {
+          e.printStackTrace();
+        } finally {
+          recorder.release();
+          recorder = null;
+        }
+      }
+    }).start();
+    return state;
+  }
+
+  private void fullTranscribeSamples(ReadableMap options, boolean skipCapturingCheck) {
+    if (!isTranscribing && nSamples > SAMPLE_RATE / 2) {
+      isTranscribing = true;
+      fullHandler = new Thread(new Runnable() {
+        @Override
+        public void run() {
+          if (!isCapturing && !skipCapturingCheck) return;
+
+          nSamplesTranscribing = nSamples;
+
+          // convert I16 to F32
+          float[] nSamplesBuffer32 = new float[nSamplesTranscribing];
+          for (int i = 0; i < nSamplesTranscribing; i++) {
+            nSamplesBuffer32[i] = buffer16[i] / 32768.0f;
+          }
+
+          Log.d(NAME, "Start transcribing realtime: " + nSamplesTranscribing);
+
+          int timeStart = (int) System.currentTimeMillis();
+          int code = full(jobId, options, nSamplesBuffer32, nSamplesTranscribing);
+          int timeEnd = (int) System.currentTimeMillis();
+          int timeRecording = (int) (nSamplesTranscribing / SAMPLE_RATE * 1000);
+
+          WritableMap payload = Arguments.createMap();
+          payload.putInt("code", code);
+          payload.putInt("processTime", timeEnd - timeStart);
+          payload.putInt("recordingTime", timeRecording);
+
+          if (code == 0) {
+            payload.putMap("data", getTextSegments());
+          } else {
+            payload.putString("error", "Transcribe failed with code " + code);
+          }
+
+          if (isStoppedByAction || !isCapturing && nSamplesTranscribing == nSamples) {
+            payload.putBoolean("isCapturing", false);
+            payload.putBoolean("isStoppedByAction", isStoppedByAction);
+            emitTranscribeEvent("@RNWhisper_onRealtimeTranscribeEnd", payload);
+          } else if (code == 0) {
+            payload.putBoolean("isCapturing", true);
+            emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
+          } else {
+            payload.putBoolean("isCapturing", true);
+            emitTranscribeEvent("@RNWhisper_onRealtimeTranscribe", payload);
+          }
+          isTranscribing = false;
+        }
+      });
+      fullHandler.start();
+    }
+  }
+
+  private void emitTranscribeEvent(final String eventName, final WritableMap payload) {
+    WritableMap event = Arguments.createMap();
+    event.putInt("contextId", WhisperContext.this.id);
+    event.putInt("jobId", jobId);
+    event.putMap("payload", payload);
+    eventEmitter.emit(eventName, event);
+  }
+
+  public WritableMap transcribeFile(int jobId, String filePath, ReadableMap options) throws IOException, Exception {
+    this.jobId = jobId;
+    isTranscribing = true;
+    float[] audioData = decodeWaveFile(new File(filePath));
+    int code = full(jobId, options, audioData, audioData.length);
+    isTranscribing = false;
+    this.jobId = -1;
+    if (code != 0) {
+      throw new Exception("Failed to transcribe the file. Code: " + code);
+    }
+    return getTextSegments();
+  }
+
+  private int full(int jobId, ReadableMap options, float[] audioData, int audioDataLen) {
+    return fullTranscribe(
+      jobId,
       context,
-
+      // jboolean realtime,
+      isRealtime,
+      // float[] audio_data,
+      audioData,
+      // jint audio_data_len,
+      audioDataLen,
       // jint n_threads,
       options.hasKey("maxThreads") ? options.getInt("maxThreads") : -1,
       // jint max_context,
@@ -69,9 +260,9 @@ public class WhisperContext {
       // jstring prompt
       options.hasKey("prompt") ? options.getString("prompt") : null
     );
-
-
-
+  }
+
+  private WritableMap getTextSegments() {
     Integer count = getTextSegmentCount(context);
     StringBuilder builder = new StringBuilder();
 
@@ -92,7 +283,28 @@ public class WhisperContext {
     return data;
   }
 
+
+  public boolean isCapturing() {
+    return isCapturing;
+  }
+
+  public boolean isTranscribing() {
+    return isTranscribing;
+  }
+
+  public void stopTranscribe(int jobId) {
+    abortTranscribe(jobId);
+    isCapturing = false;
+    isTranscribing = false;
+    isStoppedByAction = true;
+  }
+
+  public void stopCurrentTranscribe() {
+    stopTranscribe(this.jobId);
+  }
+
   public void release() {
+    stopCurrentTranscribe();
     freeContext(context);
   }
 
@@ -185,8 +397,11 @@ public class WhisperContext {
 
   protected static native long initContext(String modelPath);
   protected static native int fullTranscribe(
+    int job_id,
     long context,
+    boolean realtime,
     float[] audio_data,
+    int audio_data_len,
     int n_threads,
     int max_context,
     int word_thold,
@@ -203,6 +418,8 @@ public class WhisperContext {
     String language,
     String prompt
   );
+  protected static native void abortTranscribe(int jobId);
+  protected static native void abortAllTranscribe();
   protected static native int getTextSegmentCount(long context);
   protected static native String getTextSegment(long context, int index);
   protected static native int getTextSegmentT0(long context, int index);
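
The realtime path reports results through two device events rather than a promise. The event names and payload fields below are read directly from `emitTranscribeEvent` and `fullTranscribeSamples` above; receiving them via `DeviceEventEmitter`, and the exact shape of `data`, are assumptions about the JS side:

```ts
import { DeviceEventEmitter } from 'react-native'

type RealtimeTranscribeEvent = {
  contextId: number
  jobId: number
  payload: {
    code: number
    processTime: number   // ms spent in the native full() call
    recordingTime: number // ms of audio covered by this pass
    data?: { result: string } // text segments, set when code == 0
    error?: string            // set when code != 0
    isCapturing?: boolean
    isStoppedByAction?: boolean
  }
}

DeviceEventEmitter.addListener(
  '@RNWhisper_onRealtimeTranscribe',
  (evt: RealtimeTranscribeEvent) => console.log('partial:', evt.payload.data?.result),
)
DeviceEventEmitter.addListener(
  '@RNWhisper_onRealtimeTranscribeEnd',
  (evt: RealtimeTranscribeEvent) =>
    console.log('finished; stopped by action:', evt.payload.isStoppedByAction === true),
)
```
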

package/android/src/main/jni/whisper/Whisper.mk
CHANGED

@@ -12,7 +12,7 @@ ifneq ($(APP_OPTIM),debug)
 endif
 
 LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
-LOCAL_CPPFLAGS += -std=c++11
+LOCAL_CPPFLAGS += -std=c++11 -I $(WHISPER_LIB_DIR)
 LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
 	$(WHISPER_LIB_DIR)/whisper.cpp \
 	$(WHISPER_LIB_DIR)/rn-whisper.cpp \

package/android/src/main/jni/whisper/jni.cpp
CHANGED

@@ -5,6 +5,7 @@
 #include <cstdlib>
 #include <sys/sysinfo.h>
 #include <string>
+#include <thread>
 #include "whisper.h"
 #include "rn-whisper.h"
 #include "ggml.h"
@@ -36,8 +37,11 @@ JNIEXPORT jint JNICALL
 Java_com_rnwhisper_WhisperContext_fullTranscribe(
     JNIEnv *env,
     jobject thiz,
+    jint job_id,
     jlong context_ptr,
+    jboolean realtime,
     jfloatArray audio_data,
+    jint audio_data_len,
     jint n_threads,
     jint max_context,
     int word_thold,
@@ -57,9 +61,8 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
     UNUSED(thiz);
     struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);
     jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr);
-    const jsize audio_data_length = env->GetArrayLength(audio_data);
 
-    int max_threads = min(4,
+    int max_threads = min(4, std::thread::hardware_concurrency());
 
     LOGI("About to create params");
 
@@ -81,7 +84,7 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
     params.speed_up = speed_up;
     params.offset_ms = 0;
     params.no_context = true;
-    params.single_segment =
+    params.single_segment = realtime;
 
     if (max_len > -1) {
         params.max_len = max_len;
@@ -117,19 +120,45 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe(
        );
    }
 
+    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+        bool is_aborted = *(bool*)user_data;
+        return !is_aborted;
+    };
+    params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(job_id);
+
     LOGI("About to reset timings");
     whisper_reset_timings(context);
 
     LOGI("About to run whisper_full");
-    int code = whisper_full(context, params, audio_data_arr,
+    int code = whisper_full(context, params, audio_data_arr, audio_data_len);
     if (code == 0) {
         // whisper_print_timings(context);
     }
     env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT);
     env->ReleaseStringUTFChars(language, language_chars);
+    rn_whisper_remove_abort_map(job_id);
     return code;
 }
 
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_abortTranscribe(
+    JNIEnv *env,
+    jobject thiz,
+    jint job_id
+) {
+    UNUSED(thiz);
+    rn_whisper_abort_transcribe(job_id);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_abortAllTranscribe(
+    JNIEnv *env,
+    jobject thiz
+) {
+    UNUSED(thiz);
+    rn_whisper_abort_all_transcribe();
+}
+
 JNIEXPORT jint JNICALL
 Java_com_rnwhisper_WhisperContext_getTextSegmentCount(
     JNIEnv *env, jobject thiz, jlong context_ptr) {
@@ -176,4 +205,4 @@ Java_com_rnwhisper_WhisperContext_freeContext(
     whisper_free(context);
 }
 
-} // extern "C"
+} // extern "C"
package/cpp/rn-whisper.cpp
CHANGED
@@ -1,6 +1,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <unordered_map>
 #include "whisper.h"
 
 extern "C" {
@@ -28,4 +29,29 @@ void rn_whisper_convert_prompt(
   }
 }
 
+std::unordered_map<int, bool> abort_map;
+
+bool* rn_whisper_assign_abort_map(int job_id) {
+  abort_map[job_id] = false;
+  return &abort_map[job_id];
+}
+
+void rn_whisper_remove_abort_map(int job_id) {
+  if (abort_map.find(job_id) != abort_map.end()) {
+    abort_map.erase(job_id);
+  }
+}
+
+void rn_whisper_abort_transcribe(int job_id) {
+  if (abort_map.find(job_id) != abort_map.end()) {
+    abort_map[job_id] = true;
+  }
+}
+
+void rn_whisper_abort_all_transcribe() {
+  for (auto it = abort_map.begin(); it != abort_map.end(); ++it) {
+    it->second = true;
+  }
+}
+
 }
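
Together with `jni.cpp` above, this gives each job a dedicated abort flag: `rn_whisper_assign_abort_map` hands whisper.cpp a pointer to the flag as `encoder_begin_callback_user_data`, and returning `false` from the callback makes `whisper_full` stop before the next encoder pass. The same pattern in TypeScript, purely as an illustration (the real implementation is the C++ above):

```ts
// One mutable abort flag per job id, mirroring abort_map in rn-whisper.cpp.
const abortFlags = new Map<number, { aborted: boolean }>()

function assignAbortFlag(jobId: number) {
  const flag = { aborted: false }
  abortFlags.set(jobId, flag) // the C++ version returns a bool* into the map
  return flag
}

function abortTranscribe(jobId: number) {
  const flag = abortFlags.get(jobId)
  if (flag) flag.aborted = true // rn_whisper_abort_transcribe
}

function abortAllTranscribe() {
  for (const flag of abortFlags.values()) flag.aborted = true
}

// whisper_full consults the flag before each encoder pass via
// encoder_begin_callback; returning false aborts the job.
function encoderBeginCallback(flag: { aborted: boolean }): boolean {
  return !flag.aborted
}
```
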
package/cpp/rn-whisper.h
CHANGED
@@ -11,6 +11,11 @@ void rn_whisper_convert_prompt(
   std::string * prompt
 );
 
+bool* rn_whisper_assign_abort_map(int job_id);
+void rn_whisper_remove_abort_map(int job_id);
+void rn_whisper_abort_transcribe(int job_id);
+void rn_whisper_abort_all_transcribe();
+
 #ifdef __cplusplus
 }
 #endif
package/ios/RNWhisper.h
CHANGED