whisper.rn 0.4.0-rc.8 → 0.4.0-rc.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +2 -1
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +20 -3
- package/android/src/main/jni.cpp +29 -1
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/ggml-aarch64.c +3209 -0
- package/cpp/ggml-aarch64.h +39 -0
- package/cpp/ggml-alloc.c +725 -517
- package/cpp/ggml-alloc.h +47 -65
- package/cpp/ggml-backend-impl.h +166 -55
- package/cpp/ggml-backend.cpp +2635 -0
- package/cpp/ggml-backend.h +202 -85
- package/cpp/ggml-common.h +1853 -0
- package/cpp/ggml-cpu-impl.h +614 -0
- package/cpp/ggml-impl.h +143 -180
- package/cpp/ggml-metal.h +13 -11
- package/cpp/ggml-metal.m +2955 -1632
- package/cpp/ggml-quants.c +9824 -3263
- package/cpp/ggml-quants.h +133 -248
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +8482 -5142
- package/cpp/ggml.h +633 -349
- package/cpp/rn-whisper.cpp +91 -0
- package/cpp/rn-whisper.h +2 -0
- package/cpp/whisper.cpp +1427 -658
- package/cpp/whisper.h +84 -28
- package/ios/RNWhisper.mm +124 -37
- package/ios/RNWhisperAudioUtils.h +1 -0
- package/ios/RNWhisperAudioUtils.m +20 -13
- package/ios/RNWhisperContext.h +3 -2
- package/ios/RNWhisperContext.mm +39 -7
- package/jest/mock.js +9 -1
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +48 -19
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +48 -19
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +6 -3
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +25 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +6 -5
- package/src/NativeRNWhisper.ts +12 -3
- package/src/index.ts +63 -24
- package/src/version.json +1 -1
- package/whisper-rn.podspec +9 -2
- package/cpp/ggml-backend.c +0 -1718
- package/cpp/ggml-metal-whisper.metal +0 -5820
package/android/src/main/CMakeLists.txt
CHANGED
@@ -9,8 +9,9 @@ set(
   SOURCE_FILES
   ${RNWHISPER_LIB_DIR}/ggml.c
   ${RNWHISPER_LIB_DIR}/ggml-alloc.c
-  ${RNWHISPER_LIB_DIR}/ggml-backend.c
+  ${RNWHISPER_LIB_DIR}/ggml-backend.cpp
   ${RNWHISPER_LIB_DIR}/ggml-quants.c
+  ${RNWHISPER_LIB_DIR}/ggml-aarch64.c
   ${RNWHISPER_LIB_DIR}/whisper.cpp
   ${RNWHISPER_LIB_DIR}/rn-audioutils.cpp
   ${RNWHISPER_LIB_DIR}/rn-whisper.cpp
package/android/src/main/java/com/rnwhisper/AudioUtils.java
CHANGED
@@ -2,8 +2,6 @@ package com.rnwhisper;
 
 import android.util.Log;
 
-import java.io.IOException;
-import java.io.FileReader;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
@@ -11,23 +9,22 @@ import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.ShortBuffer;
+import java.util.Base64;
+
+import java.util.Arrays;
 
 public class AudioUtils {
   private static final String NAME = "RNWhisperAudioUtils";
 
-  public static float[] decodeWaveFile(InputStream inputStream) throws IOException {
-    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-    byte[] buffer = new byte[1024];
-    int bytesRead;
-    while ((bytesRead = inputStream.read(buffer)) != -1) {
-      baos.write(buffer, 0, bytesRead);
-    }
-    ByteBuffer byteBuffer = ByteBuffer.wrap(baos.toByteArray());
+  private static float[] bufferToFloatArray(byte[] buffer, Boolean cutHeader) {
+    ByteBuffer byteBuffer = ByteBuffer.wrap(buffer);
     byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
-    byteBuffer.position(44);
     ShortBuffer shortBuffer = byteBuffer.asShortBuffer();
     short[] shortArray = new short[shortBuffer.limit()];
     shortBuffer.get(shortArray);
+    if (cutHeader) {
+      shortArray = Arrays.copyOfRange(shortArray, 44, shortArray.length);
+    }
     float[] floatArray = new float[shortArray.length];
     for (int i = 0; i < shortArray.length; i++) {
       floatArray[i] = ((float) shortArray[i]) / 32767.0f;
@@ -36,4 +33,22 @@ public class AudioUtils {
     }
     return floatArray;
   }
-}
+
+  public static float[] decodeWaveFile(InputStream inputStream) throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    byte[] buffer = new byte[1024];
+    int bytesRead;
+    while ((bytesRead = inputStream.read(buffer)) != -1) {
+      baos.write(buffer, 0, bytesRead);
+    }
+    return bufferToFloatArray(baos.toByteArray(), true);
+  }
+
+  public static float[] decodeWaveData(String dataBase64) throws IOException {
+    return bufferToFloatArray(Base64.getDecoder().decode(dataBase64), true);
+  }
+
+  public static float[] decodePcmData(String dataBase64) {
+    return bufferToFloatArray(Base64.getDecoder().decode(dataBase64), false);
+  }
+}
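Note: the refactor above funnels all three decode paths through the new private bufferToFloatArray helper. A minimal usage sketch follows (not part of the package; the file path and buffer sizes are made up for illustration), assuming 16-bit little-endian PCM input:

// Sketch: exercising the new AudioUtils helpers from the diff above.
import com.rnwhisper.AudioUtils;

import java.io.FileInputStream;
import java.util.Base64;

class AudioUtilsUsageSketch {
  public static void main(String[] args) throws Exception {
    // WAV file on disk: decodeWaveFile buffers the stream, then converts with cutHeader = true.
    float[] fromFile = AudioUtils.decodeWaveFile(new FileInputStream("/sdcard/sample.wav"));

    // Base64-encoded WAV payload: decoded, then converted with cutHeader = true.
    String wavBase64 = Base64.getEncoder().encodeToString(new byte[44 + 3200]);
    float[] fromWavData = AudioUtils.decodeWaveData(wavBase64);

    // Raw 16-bit PCM without a header: converted with cutHeader = false, so nothing is trimmed.
    String pcmBase64 = Base64.getEncoder().encodeToString(new byte[3200]);
    float[] fromPcm = AudioUtils.decodePcmData(pcmBase64);

    System.out.println(fromFile.length + " / " + fromWavData.length + " / " + fromPcm.length);
  }
}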
package/android/src/main/java/com/rnwhisper/RNWhisper.java
CHANGED
@@ -19,6 +19,7 @@ import java.util.HashMap;
 import java.util.Random;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.InputStream;
 import java.io.PushbackInputStream;
 
 public class RNWhisper implements LifecycleEventListener {
@@ -119,44 +120,16 @@ public class RNWhisper implements LifecycleEventListener {
     tasks.put(task, "initContext");
   }
 
-  public void transcribeFile(double id, double jobId, String filePath, ReadableMap options, Promise promise) {
-    final WhisperContext context = contexts.get((int) id);
-    if (context == null) {
-      promise.reject("Context not found");
-      return;
-    }
-    if (context.isCapturing()) {
-      promise.reject("The context is in realtime transcribe mode");
-      return;
-    }
-    if (context.isTranscribing()) {
-      promise.reject("Context is already transcribing");
-      return;
-    }
+  private AsyncTask transcribe(WhisperContext context, double jobId, final float[] audioData, final ReadableMap options, Promise promise) {
     AsyncTask task = new AsyncTask<Void, Void, WritableMap>() {
       private Exception exception;
 
       @Override
       protected WritableMap doInBackground(Void... voids) {
         try {
-          String waveFilePath = filePath;
-
-          if (filePath.startsWith("http://") || filePath.startsWith("https://")) {
-            waveFilePath = downloader.downloadFile(filePath);
-          }
-
-          int resId = getResourceIdentifier(waveFilePath);
-          if (resId > 0) {
-            return context.transcribeInputStream(
-              (int) jobId,
-              reactContext.getResources().openRawResource(resId),
-              options
-            );
-          }
-
-          return context.transcribeInputStream(
+          return context.transcribe(
             (int) jobId,
-            new FileInputStream(new File(waveFilePath)),
+            audioData,
             options
           );
         } catch (Exception e) {
@@ -175,7 +148,66 @@ public class RNWhisper implements LifecycleEventListener {
         tasks.remove(this);
       }
     }.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
-    tasks.put(task, "transcribeFile-" + id);
+    return task;
+  }
+
+  public void transcribeFile(double id, double jobId, String filePathOrBase64, ReadableMap options, Promise promise) {
+    final WhisperContext context = contexts.get((int) id);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    if (context.isCapturing()) {
+      promise.reject("The context is in realtime transcribe mode");
+      return;
+    }
+    if (context.isTranscribing()) {
+      promise.reject("Context is already transcribing");
+      return;
+    }
+
+    String waveFilePath = filePathOrBase64;
+    try {
+      if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
+        waveFilePath = downloader.downloadFile(filePathOrBase64);
+      }
+
+      float[] audioData;
+      int resId = getResourceIdentifier(waveFilePath);
+      if (resId > 0) {
+        audioData = AudioUtils.decodeWaveFile(reactContext.getResources().openRawResource(resId));
+      } else if (filePathOrBase64.startsWith("data:audio/wav;base64,")) {
+        audioData = AudioUtils.decodeWaveData(filePathOrBase64);
+      } else {
+        audioData = AudioUtils.decodeWaveFile(new FileInputStream(new File(waveFilePath)));
+      }
+
+      AsyncTask task = transcribe(context, jobId, audioData, options, promise);
+      tasks.put(task, "transcribeFile-" + id);
+    } catch (Exception e) {
+      promise.reject(e);
+    }
+  }
+
+  public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
+    final WhisperContext context = contexts.get((int) id);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    if (context.isCapturing()) {
+      promise.reject("The context is in realtime transcribe mode");
+      return;
+    }
+    if (context.isTranscribing()) {
+      promise.reject("Context is already transcribing");
+      return;
+    }
+
+    float[] audioData = AudioUtils.decodePcmData(dataBase64);
+    AsyncTask task = transcribe(context, jobId, audioData, options, promise);
+
+    tasks.put(task, "transcribeData-" + id);
   }
 
   public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
@@ -211,7 +243,7 @@ public class RNWhisper implements LifecycleEventListener {
     context.stopTranscribe((int) jobId);
     AsyncTask completionTask = null;
     for (AsyncTask task : tasks.keySet()) {
-      if (tasks.get(task).equals("transcribeFile-" + id)) {
+      if (tasks.get(task).equals("transcribeFile-" + id) || tasks.get(task).equals("transcribeData-" + id)) {
         task.get();
         break;
       }
@@ -235,6 +267,15 @@ public class RNWhisper implements LifecycleEventListener {
     tasks.put(task, "abortTranscribe-" + id);
   }
 
+  public void bench(double id, double nThreads, Promise promise) {
+    final WhisperContext context = contexts.get((int) id);
+    if (context == null) {
+      promise.reject("Context not found");
+      return;
+    }
+    promise.resolve(context.bench((int) nThreads));
+  }
+
   public void releaseContext(double id, Promise promise) {
     final int contextId = (int) id;
     AsyncTask task = new AsyncTask<Void, Void, Void>() {
@@ -250,7 +291,7 @@ public class RNWhisper implements LifecycleEventListener {
         context.stopCurrentTranscribe();
         AsyncTask completionTask = null;
         for (AsyncTask task : tasks.keySet()) {
-          if (tasks.get(task).equals("transcribeFile-" + contextId)) {
+          if (tasks.get(task).equals("transcribeFile-" + contextId) || tasks.get(task).equals("transcribeData-" + contextId)) {
            task.get();
            break;
          }
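For orientation, the new transcribeFile/transcribeData pair resolves its input to a float[] before handing it to the shared transcribe(...) task. A self-contained sketch of that routing order follows (decoders and the downloader are stubbed out, and the Android raw-resource branch is omitted; the example URLs and paths are made up):

// Sketch: simplified input routing behind the new transcribeFile/transcribeData.
class TranscribeRoutingSketch {
  static String route(String filePathOrBase64) {
    if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
      return "download, then AudioUtils.decodeWaveFile on the local copy";
    }
    if (filePathOrBase64.startsWith("data:audio/wav;base64,")) {
      return "AudioUtils.decodeWaveData on the base64 WAV payload";
    }
    return "AudioUtils.decodeWaveFile on the local file path";
  }

  public static void main(String[] args) {
    System.out.println(route("https://example.com/a.wav"));
    System.out.println(route("data:audio/wav;base64,UklGRg=="));
    System.out.println(route("/sdcard/a.wav"));
    // transcribeData skips routing entirely: it always calls AudioUtils.decodePcmData.
  }
}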
package/android/src/main/java/com/rnwhisper/WhisperContext.java
CHANGED
@@ -53,6 +53,7 @@ public class WhisperContext {
   private boolean isCapturing = false;
   private boolean isStoppedByAction = false;
   private boolean isTranscribing = false;
+  private boolean isTdrzEnable = false;
   private Thread rootFullHandler = null;
   private Thread fullHandler = null;
 
@@ -73,6 +74,7 @@ public class WhisperContext {
     isCapturing = false;
     isStoppedByAction = false;
     isTranscribing = false;
+    isTdrzEnable = false;
     rootFullHandler = null;
     fullHandler = null;
   }
@@ -113,6 +115,8 @@ public class WhisperContext {
     double realtimeAudioMinSec = options.hasKey("realtimeAudioMinSec") ? options.getDouble("realtimeAudioMinSec") : 0;
     final double audioMinSec = realtimeAudioMinSec > 0.5 && realtimeAudioMinSec <= audioSliceSec ? realtimeAudioMinSec : 1;
 
+    this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable");
+
     createRealtimeTranscribeJob(jobId, context, options);
 
     sliceNSamples = new ArrayList<Integer>();
@@ -328,15 +332,15 @@ public class WhisperContext {
     }
   }
 
-  public WritableMap transcribeInputStream(int jobId, InputStream inputStream, ReadableMap options) throws IOException, Exception {
+  public WritableMap transcribe(int jobId, float[] audioData, ReadableMap options) throws IOException, Exception {
     if (isCapturing || isTranscribing) {
       throw new Exception("Context is already in capturing or transcribing");
     }
     rewind();
-
     this.jobId = jobId;
+    this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable");
+
     isTranscribing = true;
-    float[] audioData = AudioUtils.decodeWaveFile(inputStream);
 
     boolean hasProgressCallback = options.hasKey("onProgress") && options.getBoolean("onProgress");
     boolean hasNewSegmentsCallback = options.hasKey("onNewSegments") && options.getBoolean("onNewSegments");
@@ -368,8 +372,15 @@ public class WhisperContext {
 
     WritableMap data = Arguments.createMap();
     WritableArray segments = Arguments.createArray();
+
     for (int i = 0; i < count; i++) {
       String text = getTextSegment(context, i);
+
+      // If tdrzEnable is enabled and speaker turn is detected
+      if (this.isTdrzEnable && getTextSegmentSpeakerTurnNext(context, i)) {
+        text += " [SPEAKER_TURN]";
+      }
+
       builder.append(text);
 
       WritableMap segment = Arguments.createMap();
@@ -411,6 +422,10 @@ public class WhisperContext {
     stopTranscribe(this.jobId);
   }
 
+  public String bench(int n_threads) {
+    return bench(context, n_threads);
+  }
+
   public void release() {
     stopCurrentTranscribe();
     freeContext(context);
@@ -499,6 +514,7 @@ public class WhisperContext {
   protected static native String getTextSegment(long context, int index);
   protected static native int getTextSegmentT0(long context, int index);
   protected static native int getTextSegmentT1(long context, int index);
+  protected static native boolean getTextSegmentSpeakerTurnNext(long context, int index);
 
   protected static native void createRealtimeTranscribeJob(
     int job_id,
@@ -514,4 +530,5 @@ public class WhisperContext {
     int slice_index,
     int n_samples
   );
+  protected static native String bench(long context, int n_threads);
 }
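The tdrzEnable path above appends a marker to a segment's text whenever the native layer reports a speaker turn after that segment. A self-contained sketch of that loop follows, with the native methods replaced by hypothetical stand-ins so it runs on its own:

// Sketch: mirrors the segment loop added to WhisperContext.transcribe().
class SpeakerTurnSketch {
  // Stand-ins for the native methods declared on WhisperContext.
  static String getTextSegment(int i) { return " segment" + i; }
  static boolean getTextSegmentSpeakerTurnNext(int i) { return i == 1; }

  public static void main(String[] args) {
    boolean isTdrzEnable = true; // would come from options.getBoolean("tdrzEnable")
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 3; i++) {
      String text = getTextSegment(i);
      // Tag the segment when tinydiarize detects a speaker change after it.
      if (isTdrzEnable && getTextSegmentSpeakerTurnNext(i)) {
        text += " [SPEAKER_TURN]";
      }
      builder.append(text);
    }
    System.out.println(builder); // " segment0 segment1 [SPEAKER_TURN] segment2"
  }
}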
package/android/src/main/jni.cpp
CHANGED
@@ -155,6 +155,8 @@ Java_com_rnwhisper_WhisperContext_initContext(
     JNIEnv *env, jobject thiz, jstring model_path_str) {
     UNUSED(thiz);
     struct whisper_context_params cparams;
+    cparams.dtw_token_timestamps = false;
+
     struct whisper_context *context = nullptr;
     const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
     context = whisper_init_from_file_with_params(model_path_chars, cparams);
@@ -171,6 +173,8 @@ Java_com_rnwhisper_WhisperContext_initContextWithAsset(
 ) {
     UNUSED(thiz);
     struct whisper_context_params cparams;
+    cparams.dtw_token_timestamps = false;
+
     struct whisper_context *context = nullptr;
     const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
     context = whisper_init_from_asset(env, asset_manager, model_path_chars, cparams);
@@ -186,6 +190,8 @@ Java_com_rnwhisper_WhisperContext_initContextWithInputStream(
 ) {
     UNUSED(thiz);
     struct whisper_context_params cparams;
+    cparams.dtw_token_timestamps = false;
+
     struct whisper_context *context = nullptr;
     context = whisper_init_from_input_stream(env, input_stream, cparams);
     return reinterpret_cast<jlong>(context);
@@ -206,8 +212,8 @@ struct whisper_full_params createFullParams(JNIEnv *env, jobject options) {
     int n_threads = readablemap::getInt(env, options, "maxThreads", default_n_threads);
     params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
     params.translate = readablemap::getBool(env, options, "translate", false);
-    params.speed_up = readablemap::getBool(env, options, "speedUp", false);
     params.token_timestamps = readablemap::getBool(env, options, "tokenTimestamps", false);
+    params.tdrz_enable = readablemap::getBool(env, options, "tdrzEnable", false);
     params.offset_ms = 0;
     params.no_context = true;
     params.single_segment = false;
@@ -493,4 +499,26 @@ Java_com_rnwhisper_WhisperContext_freeContext(
     whisper_free(context);
 }
 
+JNIEXPORT jboolean JNICALL
+Java_com_rnwhisper_WhisperContext_getTextSegmentSpeakerTurnNext(
+    JNIEnv *env, jobject thiz, jlong context_ptr, jint index) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);
+    return whisper_full_get_segment_speaker_turn_next(context, index);
+}
+
+JNIEXPORT jstring JNICALL
+Java_com_rnwhisper_WhisperContext_bench(
+    JNIEnv *env,
+    jobject thiz,
+    jlong context_ptr,
+    jint n_threads
+) {
+    UNUSED(thiz);
+    struct whisper_context *context = reinterpret_cast<struct whisper_context *>(context_ptr);
+    std::string result = rnwhisper::bench(context, n_threads);
+    return env->NewStringUTF(result.c_str());
+}
+
 } // extern "C"
package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED
@@ -47,6 +47,11 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.transcribeFile(id, jobId, filePath, options, promise);
   }
 
+  @ReactMethod
+  public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.transcribeData(id, jobId, dataBase64, options, promise);
+  }
+
   @ReactMethod
   public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
     rnwhisper.startRealtimeTranscribe(id, jobId, options, promise);
@@ -57,6 +62,11 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.abortTranscribe(contextId, jobId, promise);
   }
 
+  @ReactMethod
+  public void bench(double id, double nThreads, Promise promise) {
+    rnwhisper.bench(id, nThreads, promise);
+  }
+
   @ReactMethod
   public void releaseContext(double id, Promise promise) {
     rnwhisper.releaseContext(id, promise);
package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED
@@ -47,6 +47,11 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
     rnwhisper.transcribeFile(id, jobId, filePath, options, promise);
   }
 
+  @ReactMethod
+  public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.transcribeData(id, jobId, dataBase64, options, promise);
+  }
+
   @ReactMethod
   public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
     rnwhisper.startRealtimeTranscribe(id, jobId, options, promise);
@@ -57,6 +62,11 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
     rnwhisper.abortTranscribe(contextId, jobId, promise);
  }
 
+  @ReactMethod
+  public void bench(double id, double nThreads, Promise promise) {
+    rnwhisper.bench(id, nThreads, promise);
+  }
+
   @ReactMethod
   public void releaseContext(double id, Promise promise) {
     rnwhisper.releaseContext(id, promise);