whisper.rn 0.4.0-rc.7 → 0.4.0-rc.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +2 -1
  2. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
  3. package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
  4. package/android/src/main/java/com/rnwhisper/WhisperContext.java +20 -3
  5. package/android/src/main/jni.cpp +29 -1
  6. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  7. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  8. package/cpp/coreml/whisper-encoder.mm +1 -1
  9. package/cpp/ggml-aarch64.c +3209 -0
  10. package/cpp/ggml-aarch64.h +39 -0
  11. package/cpp/ggml-alloc.c +732 -494
  12. package/cpp/ggml-alloc.h +47 -63
  13. package/cpp/ggml-backend-impl.h +162 -47
  14. package/cpp/ggml-backend.cpp +2635 -0
  15. package/cpp/ggml-backend.h +216 -71
  16. package/cpp/ggml-common.h +1853 -0
  17. package/cpp/ggml-cpu-impl.h +614 -0
  18. package/cpp/ggml-impl.h +144 -178
  19. package/cpp/ggml-metal.h +14 -60
  20. package/cpp/ggml-metal.m +3437 -2097
  21. package/cpp/ggml-quants.c +12559 -4189
  22. package/cpp/ggml-quants.h +135 -212
  23. package/cpp/ggml-whisper.metallib +0 -0
  24. package/cpp/ggml.c +9029 -5219
  25. package/cpp/ggml.h +673 -338
  26. package/cpp/rn-whisper.cpp +91 -0
  27. package/cpp/rn-whisper.h +2 -0
  28. package/cpp/whisper.cpp +1476 -675
  29. package/cpp/whisper.h +84 -28
  30. package/ios/RNWhisper.mm +124 -37
  31. package/ios/RNWhisperAudioUtils.h +1 -0
  32. package/ios/RNWhisperAudioUtils.m +20 -13
  33. package/ios/RNWhisperContext.h +3 -2
  34. package/ios/RNWhisperContext.mm +41 -8
  35. package/jest/mock.js +9 -1
  36. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  37. package/lib/commonjs/index.js +48 -19
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/commonjs/version.json +1 -1
  40. package/lib/module/NativeRNWhisper.js.map +1 -1
  41. package/lib/module/index.js +48 -19
  42. package/lib/module/index.js.map +1 -1
  43. package/lib/module/version.json +1 -1
  44. package/lib/typescript/NativeRNWhisper.d.ts +6 -3
  45. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  46. package/lib/typescript/index.d.ts +25 -3
  47. package/lib/typescript/index.d.ts.map +1 -1
  48. package/package.json +6 -5
  49. package/src/NativeRNWhisper.ts +12 -3
  50. package/src/index.ts +63 -24
  51. package/src/version.json +1 -1
  52. package/whisper-rn.podspec +9 -2
  53. package/cpp/ggml-backend.c +0 -1357
  54. package/cpp/ggml-metal-whisper.metal +0 -4908
package/cpp/whisper.h CHANGED
@@ -84,9 +84,48 @@ extern "C" {
     typedef int32_t whisper_token;
     typedef int32_t whisper_seq_id;

+    enum whisper_alignment_heads_preset {
+        WHISPER_AHEADS_NONE,
+        WHISPER_AHEADS_N_TOP_MOST,  // All heads from the N-top-most text-layers
+        WHISPER_AHEADS_CUSTOM,
+        WHISPER_AHEADS_TINY_EN,
+        WHISPER_AHEADS_TINY,
+        WHISPER_AHEADS_BASE_EN,
+        WHISPER_AHEADS_BASE,
+        WHISPER_AHEADS_SMALL_EN,
+        WHISPER_AHEADS_SMALL,
+        WHISPER_AHEADS_MEDIUM_EN,
+        WHISPER_AHEADS_MEDIUM,
+        WHISPER_AHEADS_LARGE_V1,
+        WHISPER_AHEADS_LARGE_V2,
+        WHISPER_AHEADS_LARGE_V3,
+        WHISPER_AHEADS_LARGE_V3_TURBO,
+    };
+
+    typedef struct whisper_ahead {
+        int n_text_layer;
+        int n_head;
+    } whisper_ahead;
+
+    typedef struct whisper_aheads {
+        size_t n_heads;
+        const whisper_ahead * heads;
+    } whisper_aheads;
+
     struct whisper_context_params {
         bool use_gpu;
         bool use_coreml;
+        bool flash_attn;
+        int  gpu_device;  // CUDA device
+
+        // [EXPERIMENTAL] Token-level timestamps with DTW
+        bool dtw_token_timestamps;
+        enum whisper_alignment_heads_preset dtw_aheads_preset;
+
+        int dtw_n_top;
+        struct whisper_aheads dtw_aheads;
+
+        size_t dtw_mem_size; // TODO: remove
     };

     typedef struct whisper_token_data {
@@ -103,6 +142,11 @@ extern "C" {
         int64_t t0;        // start time of the token
         int64_t t1;        // end time of the token

+        // [EXPERIMENTAL] Token-level timestamps with DTW
+        // do not use if you haven't computed token-level timestamps with dtw
+        // Roughly corresponds to the moment in audio in which the token was output
+        int64_t t_dtw;
+
         float vlen;        // voice length of the token
     } whisper_token_data;

@@ -196,6 +240,13 @@ extern "C" {
     // GPU, by caching compiled 'blobs' there.
     // Set to nullptr if not used.
     // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
+    WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
+        struct whisper_context * ctx,
+        struct whisper_state * state,
+        const char * model_path,
+        const char * device,
+        const char * cache_dir);
+
     WHISPER_API int whisper_ctx_init_openvino_encoder(
         struct whisper_context * ctx,
         const char * model_path,
@@ -224,22 +275,6 @@ extern "C" {
             int n_samples,
             int n_threads);

-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-        const float * samples,
-            int n_samples,
-            int n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-        struct whisper_state * state,
-        const float * samples,
-            int n_samples,
-            int n_threads);
-
     // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
     // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
     // n_mel must be 80
@@ -296,7 +331,7 @@ extern "C" {
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns -1 on failure
+    // Returns a negative number on failure - the number of tokens that would have been returned
     // TODO: not sure if correct
     WHISPER_API int whisper_tokenize(
         struct whisper_context * ctx,
@@ -304,8 +339,12 @@ extern "C" {
         whisper_token * tokens,
         int n_max_tokens);

+    // Return the number of tokens in the provided text
+    // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
+    int whisper_token_count(struct whisper_context * ctx, const char * text);
+
     // Largest language id (i.e. number of available languages - 1)
-    WHISPER_API int whisper_lang_max_id();
+    WHISPER_API int whisper_lang_max_id(void);

     // Return the id of the specified language, returns -1 if not found
     // Examples:
@@ -385,6 +424,24 @@ extern "C" {
     WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);

     // Performance information from the default state.
+    struct whisper_timings {
+        int64_t load_us;
+        int64_t t_start_us;
+        int32_t fail_p;
+        int32_t fail_h;
+        int64_t t_mel_us;
+        int32_t n_sample;
+        int32_t n_encode;
+        int32_t n_decode;
+        int32_t n_batchd;
+        int32_t n_prompt;
+        int64_t t_sample_us;
+        int64_t t_encode_us;
+        int64_t t_decode_us;
+        int64_t t_batchd_us;
+        int64_t t_prompt_us;
+    };
+    WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
     WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
     WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

@@ -412,11 +469,6 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);

-    // Abort callback
-    // If not NULL, called before ggml computation
-    // If it returns true, the computation is aborted
-    typedef bool (*whisper_abort_callback)(void * user_data);
-
     // Logits filter callback
     // Can be used to modify the logits before sampling
     // If not NULL, called after applying temperature to logits
@@ -458,15 +510,19 @@ extern "C" {

         // [EXPERIMENTAL] speed-up techniques
         // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
         bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
         int  audio_ctx;         // overwrite the audio context size (0 = use default)

         // [EXPERIMENTAL] [TDRZ] tinydiarize
         bool tdrz_enable;       // enable tinydiarize speaker turn detection

+        // A regular expression that matches tokens to suppress
+        const char * suppress_regex;
+
         // tokens to provide to the whisper decoder as initial prompt
         // these are prepended to any existing text context from a previous call
+        // use whisper_tokenize() to convert text to tokens
+        // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
         const char * initial_prompt;
         const whisper_token * prompt_tokens;
         int prompt_n_tokens;
@@ -513,7 +569,7 @@ extern "C" {
         void * encoder_begin_callback_user_data;

         // called each time before ggml computation starts
-        whisper_abort_callback abort_callback;
+        wsp_ggml_abort_callback abort_callback;
         void * abort_callback_user_data;

         // called by each decoder to filter obtained logits
@@ -527,10 +583,10 @@ extern "C" {
     };

     // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
-    WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
-    WHISPER_API struct whisper_context_params whisper_context_default_params(void);
+    WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
+    WHISPER_API struct whisper_context_params   whisper_context_default_params       (void);
     WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
-    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
+    WHISPER_API struct whisper_full_params   whisper_full_default_params       (enum whisper_sampling_strategy strategy);

     // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
     // Not thread safe for same context
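At the JavaScript layer, the most visible of these whisper_context_params additions is flash_attn: the RNWhisper.mm diff below reads a useFlashAttn model option and forwards it when creating the context. A minimal TypeScript sketch of enabling it, assuming the option name exposed by package/src/index.ts matches the modelOptions key read on iOS:

import { initWhisper } from 'whisper.rn'

async function createContext() {
  // `useFlashAttn` (assumed JS option name, mirroring the modelOptions key
  // read in RNWhisper.mm below) maps to whisper_context_params.flash_attn.
  return initWhisper({
    filePath: 'ggml-tiny.en.bin', // placeholder path to a ggml model on device
    useFlashAttn: true,
  })
}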
package/ios/RNWhisper.mm CHANGED
@@ -50,6 +50,7 @@ RCT_REMAP_METHOD(initContext,
     BOOL isBundleAsset = [[modelOptions objectForKey:@"isBundleAsset"] boolValue];
     BOOL useGpu = [[modelOptions objectForKey:@"useGpu"] boolValue];
     BOOL useCoreMLIos = [[modelOptions objectForKey:@"useCoreMLIos"] boolValue];
+    BOOL useFlashAttn = [[modelOptions objectForKey:@"useFlashAttn"] boolValue];

     // For support debug assets in development mode
     BOOL downloadCoreMLAssets = [[modelOptions objectForKey:@"downloadCoreMLAssets"] boolValue];
@@ -79,6 +80,7 @@ RCT_REMAP_METHOD(initContext,
         contextId:contextId
         noCoreML:!useCoreMLIos
         noMetal:!useGpu
+        useFlashAttn:useFlashAttn
     ];
     if ([context getContext] == NULL) {
         reject(@"whisper_cpp_error", @"Failed to load the model", nil);
@@ -103,42 +105,17 @@
     ];
 }

-RCT_REMAP_METHOD(transcribeFile,
-    withContextId:(int)contextId
-    withJobId:(int)jobId
-    withWaveFile:(NSString *)waveFilePath
-    withOptions:(NSDictionary *)options
-    withResolver:(RCTPromiseResolveBlock)resolve
-    withRejecter:(RCTPromiseRejectBlock)reject)
+- (void)transcribeData:(RNWhisperContext *)context
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withData:(float *)data
+    withDataCount:(int)count
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject
 {
-    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
-
-    if (context == nil) {
-        reject(@"whisper_error", @"Context not found", nil);
-        return;
-    }
-    if ([context isCapturing]) {
-        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
-        return;
-    }
-    if ([context isTranscribing]) {
-        reject(@"whisper_error", @"Context is already transcribing", nil);
-        return;
-    }
-
-    NSString *path = waveFilePath;
-    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
-        path = [RNWhisperDownloader downloadFile:path toFile:nil];
-    }
-
-    int count = 0;
-    float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
-    if (waveFile == nil) {
-        reject(@"whisper_error", @"Invalid file", nil);
-        return;
-    }
-    [context transcribeFile:jobId
-        audioData:waveFile
+    [context transcribeData:jobId
+        audioData:data
         audioDataCount:count
         options:options
         onProgress: ^(int progress) {
@@ -171,11 +148,9 @@
         }
         onEnd: ^(int code) {
             if (code != 0 && code != 999) {
-                free(waveFile);
                 reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
                 return;
             }
-            free(waveFile);
             NSMutableDictionary *result = [context getTextSegments];
             result[@"isAborted"] = @([context isStoppedByAction]);
             resolve(result);
@@ -183,6 +158,99 @@
     ];
 }

+RCT_REMAP_METHOD(transcribeFile,
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withWaveFile:(NSString *)waveFilePathOrDataBase64
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"Context is already transcribing", nil);
+        return;
+    }
+
+    float *data = nil;
+    int count = 0;
+    if ([waveFilePathOrDataBase64 hasPrefix:@"http://"] || [waveFilePathOrDataBase64 hasPrefix:@"https://"]) {
+        NSString *path = [RNWhisperDownloader downloadFile:waveFilePathOrDataBase64 toFile:nil];
+        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
+    } else if ([waveFilePathOrDataBase64 hasPrefix:@"data:audio/wav;base64,"]) {
+        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[waveFilePathOrDataBase64 substringFromIndex:22] options:0];
+        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
+    } else {
+        data = [RNWhisperAudioUtils decodeWaveFile:waveFilePathOrDataBase64 count:&count];
+    }
+    if (data == nil) {
+        reject(@"whisper_error", @"Invalid file", nil);
+        return;
+    }
+
+    [self transcribeData:context
+        withContextId:contextId
+        withJobId:jobId
+        withData:data
+        withDataCount:count
+        withOptions:options
+        withResolver:resolve
+        withRejecter:reject
+    ];
+}
+
+RCT_REMAP_METHOD(transcribeData,
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withData:(NSString *)dataBase64 // pcm data base64 encoded
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"Context is already transcribing", nil);
+        return;
+    }
+
+    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:dataBase64 options:0];
+    int count = 0;
+    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+
+    if (data == nil) {
+        reject(@"whisper_error", @"Invalid data", nil);
+        return;
+    }
+
+    [self transcribeData:context
+        withContextId:contextId
+        withJobId:jobId
+        withData:data
+        withDataCount:count
+        withOptions:options
+        withResolver:resolve
+        withRejecter:reject
+    ];
+}
+
 RCT_REMAP_METHOD(startRealtimeTranscribe,
     withContextId:(int)contextId
     withJobId:(int)jobId
@@ -244,6 +312,25 @@ RCT_REMAP_METHOD(abortTranscribe,
     resolve(nil);
 }

+RCT_REMAP_METHOD(bench,
+    withContextId:(int)contextId
+    withMaxThreads:(int)maxThreads
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"The context is transcribing", nil);
+        return;
+    }
+    NSString *result = [context bench:maxThreads];
+    resolve(result);
+}
+
 RCT_REMAP_METHOD(releaseContext,
     withContextId:(int)contextId
     withResolver:(RCTPromiseResolveBlock)resolve
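The refactor above separates decoding from transcription: transcribeFile now also accepts a data:audio/wav;base64, URI, and the new transcribeData method takes base64-encoded raw 16-bit PCM (decoded with cutHeader:NO, i.e. no WAV header expected). A hedged TypeScript sketch of calling it, assuming the transcribeData wrapper added in package/src/index.ts returns the same { stop, promise } shape as transcribe:

// `pcmBase64` must be base64-encoded 16-bit little-endian mono PCM at 16 kHz,
// matching what decodeWaveData(..., cutHeader:NO) expects on the native side.
async function transcribePcm(
  ctx: { transcribeData: (data: string, options: object) => { promise: Promise<{ result: string }> } },
  pcmBase64: string,
): Promise<string> {
  const { promise } = ctx.transcribeData(pcmBase64, { language: 'en' })
  const { result } = await promise
  return result
}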
package/ios/RNWhisperAudioUtils.h CHANGED
@@ -2,6 +2,7 @@

 @interface RNWhisperAudioUtils : NSObject

++ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader;
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;

 @end
package/ios/RNWhisperAudioUtils.m CHANGED
@@ -3,25 +3,32 @@

 @implementation RNWhisperAudioUtils

++ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader {
+    NSData *waveData = data;
+    if (cutHeader) {
+        // just cut 44 bytes from the beginning
+        waveData = [data subdataWithRange:NSMakeRange(44, [data length]-44)];
+    }
+    const short *shortArray = (const short *)[waveData bytes];
+    int shortCount = (int) ([waveData length] / sizeof(short));
+    float *floatArray = (float *) malloc(shortCount * sizeof(float));
+    for (NSInteger i = 0; i < shortCount; i++) {
+        float floatValue = ((float)shortArray[i]) / 32767.0;
+        floatValue = MAX(floatValue, -1.0);
+        floatValue = MIN(floatValue, 1.0);
+        floatArray[i] = floatValue;
+    }
+    *count = shortCount;
+    return floatArray;
+}
+
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
     NSURL *url = [NSURL fileURLWithPath:filePath];
     NSData *fileData = [NSData dataWithContentsOfURL:url];
     if (fileData == nil) {
         return nil;
     }
-    NSMutableData *waveData = [[NSMutableData alloc] init];
-    [waveData appendData:[fileData subdataWithRange:NSMakeRange(44, [fileData length]-44)]];
-    const short *shortArray = (const short *)[waveData bytes];
-    int shortCount = (int) ([waveData length] / sizeof(short));
-    float *floatArray = (float *) malloc(shortCount * sizeof(float));
-    for (NSInteger i = 0; i < shortCount; i++) {
-        float floatValue = ((float)shortArray[i]) / 32767.0;
-        floatValue = MAX(floatValue, -1.0);
-        floatValue = MIN(floatValue, 1.0);
-        floatArray[i] = floatValue;
-    }
-    *count = shortCount;
-    return floatArray;
+    return [RNWhisperAudioUtils decodeWaveData:fileData count:count cutHeader:YES];
 }

 @end
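decodeWaveData is now the single decode path: it optionally strips the 44-byte canonical WAV header, then converts 16-bit samples to floats in [-1, 1] by dividing by 32767. Building a compatible payload in JS is just the inverse conversion; a sketch assuming the 'buffer' polyfill package is installed in the React Native app:

import { Buffer } from 'buffer'

// Convert float samples in [-1, 1] into the base64 16-bit PCM payload that
// decodeWaveData(..., cutHeader:NO) expects (mono, 16 kHz, little-endian).
function floatToPcmBase64(samples: Float32Array): string {
  const pcm = new Int16Array(samples.length)
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]))
    pcm[i] = Math.round(s * 32767) // inverse of the / 32767.0 in decodeWaveData
  }
  return Buffer.from(pcm.buffer).toString('base64')
}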
package/ios/RNWhisperContext.h CHANGED
@@ -42,7 +42,7 @@ typedef struct {
     bool isMetalEnabled;
 }

-+ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML noMetal:(BOOL)noMetal;
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML noMetal:(BOOL)noMetal useFlashAttn:(BOOL)useFlashAttn;
 - (bool)isMetalEnabled;
 - (NSString *)reasonNoMetal;
 - (struct whisper_context *)getContext;
@@ -50,7 +50,7 @@ typedef struct {
 - (OSStatus)transcribeRealtime:(int)jobId
     options:(NSDictionary *)options
     onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe;
-- (void)transcribeFile:(int)jobId
+- (void)transcribeData:(int)jobId
     audioData:(float *)audioData
     audioDataCount:(int)audioDataCount
     options:(NSDictionary *)options
@@ -63,6 +63,7 @@ typedef struct {
 - (bool)isTranscribing;
 - (bool)isStoppedByAction;
 - (NSMutableDictionary *)getTextSegments;
+- (NSString *)bench:(int)maxThreads;
 - (void)invalidate;

 @end
package/ios/RNWhisperContext.mm CHANGED
@@ -10,12 +10,17 @@
     contextId:(int)contextId
     noCoreML:(BOOL)noCoreML
     noMetal:(BOOL)noMetal
+    useFlashAttn:(BOOL)useFlashAttn
 {
     RNWhisperContext *context = [[RNWhisperContext alloc] init];
     context->contextId = contextId;
     struct whisper_context_params cparams;
     NSString *reasonNoMetal = @"";
     cparams.use_gpu = !noMetal;
+    cparams.flash_attn = useFlashAttn;
+
+    // TODO: Figure out why it leads to re-init crash
+    cparams.dtw_token_timestamps = false;

     cparams.use_coreml = !noCoreML;
 #ifndef WHISPER_USE_COREML
@@ -116,6 +121,7 @@
     self->recordState.transcribeSliceIndex = 0;
     self->recordState.nSamplesTranscribing = 0;

+    self->recordState.sliceNSamples.clear();
     self->recordState.sliceNSamples.push_back(0);

     self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
@@ -202,7 +208,7 @@ void AudioInputCallback(void * inUserData,
         state->sliceNSamples.push_back(0);
     }

-    NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
+    NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);

     state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);

@@ -352,9 +358,10 @@
 struct rnwhisper_segments_callback_data {
     void (^onNewSegments)(NSDictionary *);
     int total_n_new;
+    bool tdrzEnable;
 };

-- (void)transcribeFile:(int)jobId
+- (void)transcribeData:(int)jobId
     audioData:(float *)audioData
     audioDataCount:(int)audioDataCount
     options:(NSDictionary *)options
@@ -385,12 +392,18 @@ struct rnwhisper_segments_callback_data {
             NSMutableArray *segments = [[NSMutableArray alloc] init];
             for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
                 const char * text_cur = whisper_full_get_segment_text(ctx, i);
-                text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+                NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+                if (data->tdrzEnable && whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+                    [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+                }
+
+                text = [text stringByAppendingString:mutable_ns_text];

                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
                 NSDictionary *segment = @{
-                    @"text": [NSString stringWithUTF8String:text_cur],
+                    @"text": [NSString stringWithString:mutable_ns_text],
                     @"t0": [NSNumber numberWithLongLong:t0],
                     @"t1": [NSNumber numberWithLongLong:t1]
                 };
@@ -408,7 +421,8 @@ struct rnwhisper_segments_callback_data {
         };
         struct rnwhisper_segments_callback_data user_data = {
             .onNewSegments = onNewSegments,
-            .total_n_new = 0
+            .tdrzEnable = options[@"tdrzEnable"] && [options[@"tdrzEnable"] boolValue],
+            .total_n_new = 0,
         };
         params.new_segment_callback_user_data = &user_data;
     }
@@ -468,7 +482,6 @@ struct rnwhisper_segments_callback_data {
     params.print_progress = false;
     params.print_timestamps = false;
     params.print_special = false;
-    params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
     params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
     params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
     params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
@@ -480,6 +493,7 @@ struct rnwhisper_segments_callback_data {
         params.max_len = [options[@"maxLen"] intValue];
     }
     params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+    params.tdrz_enable = options[@"tdrzEnable"] != nil ? [options[@"tdrzEnable"] boolValue] : false;

     if (options[@"bestOf"] != nil) {
         params.greedy.best_of = [options[@"bestOf"] intValue];
@@ -529,12 +543,21 @@ struct rnwhisper_segments_callback_data {
     NSMutableArray *segments = [[NSMutableArray alloc] init];
     for (int i = 0; i < n_segments; i++) {
         const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
-        text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+        NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+        // Simplified condition
+        if (self->recordState.options[@"tdrzEnable"] &&
+            [self->recordState.options[@"tdrzEnable"] boolValue] &&
+            whisper_full_get_segment_speaker_turn_next(self->ctx, i)) {
+            [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+        }
+
+        text = [text stringByAppendingString:mutable_ns_text];

         const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
         const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
         NSDictionary *segment = @{
-            @"text": [NSString stringWithUTF8String:text_cur],
+            @"text": [NSString stringWithString:mutable_ns_text],
             @"t0": [NSNumber numberWithLongLong:t0],
             @"t1": [NSNumber numberWithLongLong:t1]
         };
@@ -546,6 +569,16 @@ struct rnwhisper_segments_callback_data {
     return result;
 }

+- (NSString *)bench:(int)maxThreads {
+    const int n_threads = maxThreads > 0 ? maxThreads : 0;
+
+    const int max_threads = (int) [[NSProcessInfo processInfo] processorCount];
+    // Use 2 threads by default on 4-core devices, 4 threads on more cores
+    const int default_n_threads = max_threads == 4 ? 2 : MIN(4, max_threads);
+    NSString *result = [NSString stringWithUTF8String:rnwhisper::bench(self->ctx, n_threads).c_str()];
+    return result;
+}
+
 - (void)invalidate {
     [self stopCurrentTranscribe];
     whisper_free(self->ctx);
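With tdrz_enable set, both the file/data path and the realtime path now append " [SPEAKER_TURN]" to a segment's text whenever whisper.cpp reports a speaker change. A hedged sketch of consuming that marker from JS, using the tdrzEnable option key the Objective-C code reads above (requires a tinydiarize-capable model):

type Segment = { text: string; t0: number; t1: number }

async function logSpeakerTurns(
  ctx: { transcribe: (path: string, options: object) => { promise: Promise<{ segments: Segment[] }> } },
  filePath: string,
) {
  const { promise } = ctx.transcribe(filePath, { language: 'en', tdrzEnable: true })
  const { segments } = await promise
  for (const s of segments) {
    // The native side appends this marker when a speaker turn is detected.
    if (s.text.endsWith(' [SPEAKER_TURN]')) console.log('speaker turn near', s.t1)
  }
}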
package/jest/mock.js CHANGED
@@ -45,11 +45,19 @@ if (!NativeModules.RNWhisper) {
       })
     })
   }),
+  bench: jest.fn(() => Promise.resolve({
+    config: 'NEON',
+    nThreads: 1,
+    encodeMs: 1,
+    decodeMs: 1,
+    batchMs: 1,
+    promptMs: 1,
+  })),
   releaseContext: jest.fn(() => Promise.resolve()),
   releaseAllContexts: jest.fn(() => Promise.resolve()),

   // iOS AudioSession utils
-  getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
+  getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
     category: 'AVAudioSessionCategoryPlayAndRecord',
     options: [],
   })),
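The bench mock mirrors the bench method added on both platforms, so tests can exercise it without a device. A sketch of a Jest test using the package's documented mock, assuming the context wrapper in package/src/index.ts exposes bench and resolves to the mocked shape:

jest.mock('whisper.rn', () => require('whisper.rn/jest/mock'))

test('bench resolves with mocked timings', async () => {
  const { initWhisper } = require('whisper.rn')
  const ctx = await initWhisper({ filePath: 'ggml-tiny.en.bin' })
  const result = await ctx.bench(4) // maxThreads
  expect(result.config).toBe('NEON')
})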
package/lib/commonjs/NativeRNWhisper.js.map CHANGED
@@ -1 +1 @@
-{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GAiGnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
+{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0GnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}