whisper.rn 0.4.0 → 0.4.1

This diff compares the contents of publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (31)
  1. package/README.md +69 -0
  2. package/android/src/main/java/com/rnwhisper/RNWhisper.java +211 -0
  3. package/android/src/main/java/com/rnwhisper/WhisperContext.java +34 -4
  4. package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +157 -0
  5. package/android/src/main/jni.cpp +196 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  12. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  13. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  14. package/ios/RNWhisper.mm +147 -0
  15. package/ios/RNWhisperContext.mm +18 -24
  16. package/ios/RNWhisperVadContext.h +29 -0
  17. package/ios/RNWhisperVadContext.mm +152 -0
  18. package/jest/mock.js +19 -0
  19. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  20. package/lib/commonjs/index.js +111 -1
  21. package/lib/commonjs/index.js.map +1 -1
  22. package/lib/module/NativeRNWhisper.js.map +1 -1
  23. package/lib/module/index.js +112 -0
  24. package/lib/module/index.js.map +1 -1
  25. package/lib/typescript/NativeRNWhisper.d.ts +35 -0
  26. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  27. package/lib/typescript/index.d.ts +39 -3
  28. package/lib/typescript/index.d.ts.map +1 -1
  29. package/package.json +1 -1
  30. package/src/NativeRNWhisper.ts +48 -0
  31. package/src/index.ts +132 -1
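
The headline change in 0.4.1 is a new voice activity detection (VAD) API that spans the TypeScript surface, the Android JNI and Java layers, and iOS. As a rough sketch of that surface, a caller might reach the new native methods as below. The wrapper names exported from package/src/index.ts are not visible in this diff, so the sketch goes through NativeModules directly; the option and result shapes are read off the native hunks that follow, while the model filename is an assumption.

import { NativeModules } from 'react-native';

const { RNWhisper } = NativeModules;

async function detectSpeechSegments(wavPath: string) {
  // initVadContext resolves with { contextId, gpu, reasonNoGPU } (see the RNWhisper.mm hunk below).
  const { contextId } = await RNWhisper.initVadContext({
    filePath: 'ggml-silero-vad.bin', // assumed filename, not taken from this diff
    isBundleAsset: true,
    useGpu: true,
  });

  // These options map one-to-one onto whisper_vad_params (see the JNI and ObjC hunks below).
  const segments: Array<{ t0: number; t1: number }> =
    await RNWhisper.vadDetectSpeechFile(contextId, wavPath, {
      threshold: 0.5,
      minSpeechDurationMs: 250,
      minSilenceDurationMs: 100,
    });

  await RNWhisper.releaseVadContext(contextId);
  return segments;
}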
package/android/src/main/jni.cpp CHANGED
@@ -148,6 +148,47 @@ static struct whisper_context *whisper_init_from_asset(
     return whisper_init_with_params(&loader, cparams);
 }
 
+// VAD context initialization functions
+static struct whisper_vad_context *whisper_vad_init_from_input_stream(
+    JNIEnv *env,
+    jobject input_stream, // PushbackInputStream
+    struct whisper_vad_context_params vad_params
+) {
+    input_stream_context *context = new input_stream_context;
+    context->env = env;
+    context->input_stream = env->NewGlobalRef(input_stream);
+
+    whisper_model_loader loader = {
+        .context = context,
+        .read = &input_stream_read,
+        .eof = &input_stream_is_eof,
+        .close = &input_stream_close
+    };
+    return whisper_vad_init_with_params(&loader, vad_params);
+}
+
+static struct whisper_vad_context *whisper_vad_init_from_asset(
+    JNIEnv *env,
+    jobject assetManager,
+    const char *asset_path,
+    struct whisper_vad_context_params vad_params
+) {
+    LOGI("Loading VAD model from asset '%s'\n", asset_path);
+    AAssetManager *asset_manager = AAssetManager_fromJava(env, assetManager);
+    AAsset *asset = AAssetManager_open(asset_manager, asset_path, AASSET_MODE_STREAMING);
+    if (!asset) {
+        LOGW("Failed to open VAD asset '%s'\n", asset_path);
+        return NULL;
+    }
+    whisper_model_loader loader = {
+        .context = asset,
+        .read = &asset_read,
+        .eof = &asset_is_eof,
+        .close = &asset_close
+    };
+    return whisper_vad_init_with_params(&loader, vad_params);
+}
+
 extern "C" {
 
 JNIEXPORT jlong JNICALL
@@ -530,4 +571,159 @@ Java_com_rnwhisper_WhisperContext_bench(
     return env->NewStringUTF(result.c_str());
 }
 
+// VAD Context JNI implementations
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContext(
+    JNIEnv *env,
+    jobject thiz,
+    jstring model_path_str
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
+    vad_context = whisper_vad_init_from_file_with_params(model_path_chars, vad_params);
+    env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContextWithAsset(
+    JNIEnv *env,
+    jobject thiz,
+    jobject asset_manager,
+    jstring model_path_str
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
+    vad_context = whisper_vad_init_from_asset(env, asset_manager, model_path_chars, vad_params);
+    env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContextWithInputStream(
+    JNIEnv *env,
+    jobject thiz,
+    jobject input_stream
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    vad_context = whisper_vad_init_from_input_stream(env, input_stream, vad_params);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_freeVadContext(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+    whisper_vad_free(vad_context);
+}
+
+JNIEXPORT jboolean JNICALL
+Java_com_rnwhisper_WhisperContext_vadDetectSpeech(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr,
+    jfloatArray audio_data,
+    jint n_samples
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+
+    jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr);
+    bool result = whisper_vad_detect_speech(vad_context, audio_data_arr, n_samples);
+    env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT);
+
+    return result;
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentsFromProbs(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr,
+    jfloat threshold,
+    jint min_speech_duration_ms,
+    jint min_silence_duration_ms,
+    jfloat max_speech_duration_s,
+    jint speech_pad_ms,
+    jfloat samples_overlap
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+
+    struct whisper_vad_params vad_params = whisper_vad_default_params();
+    vad_params.threshold = threshold;
+    vad_params.min_speech_duration_ms = min_speech_duration_ms;
+    vad_params.min_silence_duration_ms = min_silence_duration_ms;
+    vad_params.max_speech_duration_s = max_speech_duration_s;
+    vad_params.speech_pad_ms = speech_pad_ms;
+    vad_params.samples_overlap = samples_overlap;
+
+    struct whisper_vad_segments *segments = whisper_vad_segments_from_probs(vad_context, vad_params);
+    return reinterpret_cast<jlong>(segments);
+}
+
+JNIEXPORT jint JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetNSegments(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_n_segments(segments);
+}
+
+JNIEXPORT jfloat JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentT0(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr,
+    jint index
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_get_segment_t0(segments, index);
+}
+
+JNIEXPORT jfloat JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentT1(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr,
+    jint index
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_get_segment_t1(segments, index);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_vadFreeSegments(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    whisper_vad_free_segments(segments);
+}
+
 } // extern "C"
package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java CHANGED
@@ -77,6 +77,32 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.releaseAllContexts(promise);
   }
 
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
+
   /*
    * iOS Specific methods, left here for make the turbo module happy:
    */
package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java CHANGED
@@ -76,4 +76,30 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
   public void releaseAllContexts(Promise promise) {
     rnwhisper.releaseAllContexts(promise);
   }
+
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
 }
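
Both architecture flavors delegate to the same shared RNWhisper implementation. The matching TurboModule spec lands in package/src/NativeRNWhisper.ts (+48 lines, not shown in this section); what follows is a speculative TypeScript fragment of what those entries must look like, derived only from the Java signatures above.

// Hypothetical spec fragment; the actual NativeRNWhisper.ts additions are not shown in this diff section.
export interface VadMethods {
  initVadContext(options: object): Promise<{ contextId: number; gpu: boolean; reasonNoGPU: string }>;
  vadDetectSpeech(contextId: number, audioDataBase64: string, options: object): Promise<Array<{ t0: number; t1: number }>>;
  vadDetectSpeechFile(contextId: number, filePath: string, options: object): Promise<Array<{ t0: number; t1: number }>>;
  releaseVadContext(contextId: number): Promise<void>;
  releaseAllVadContexts(): Promise<void>;
}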
package/ios/RNWhisper.mm CHANGED
@@ -1,5 +1,6 @@
 #import "RNWhisper.h"
 #import "RNWhisperContext.h"
+#import "RNWhisperVadContext.h"
 #import "RNWhisperDownloader.h"
 #import "RNWhisperAudioUtils.h"
 #import "RNWhisperAudioSessionUtils.h"
@@ -13,6 +14,7 @@
 @implementation RNWhisper
 
 NSMutableDictionary *contexts;
+NSMutableDictionary *vadContexts;
 
 RCT_EXPORT_MODULE()
 
@@ -366,6 +368,15 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
 
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+        vadContexts = nil;
+    }
+
     rnwhisper::job_abort_all(); // graceful abort
 
     [contexts removeAllObjects];
@@ -437,6 +448,142 @@ RCT_REMAP_METHOD(setAudioSessionActive,
     resolve(nil);
 }
 
+RCT_REMAP_METHOD(initVadContext,
+                 withVadOptions:(NSDictionary *)vadOptions
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts == nil) {
+        vadContexts = [[NSMutableDictionary alloc] init];
+    }
+
+    NSString *modelPath = [vadOptions objectForKey:@"filePath"];
+    BOOL isBundleAsset = [[vadOptions objectForKey:@"isBundleAsset"] boolValue];
+    BOOL useGpu = [[vadOptions objectForKey:@"useGpu"] boolValue];
+    NSNumber *nThreads = [vadOptions objectForKey:@"nThreads"];
+
+    NSString *path = modelPath;
+    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
+        path = [RNWhisperDownloader downloadFile:path toFile:nil];
+    }
+    if (isBundleAsset) {
+        path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
+    }
+
+    int contextId = arc4random_uniform(1000000);
+
+    RNWhisperVadContext *vadContext = [RNWhisperVadContext
+        initWithModelPath:path
+        contextId:contextId
+        noMetal:!useGpu
+        nThreads:nThreads
+    ];
+    if ([vadContext getVadContext] == NULL) {
+        reject(@"whisper_vad_error", @"Failed to load the VAD model", nil);
+        return;
+    }
+
+    [vadContexts setObject:vadContext forKey:[NSNumber numberWithInt:contextId]];
+
+    resolve(@{
+        @"contextId": @(contextId),
+        @"gpu": @([vadContext isMetalEnabled]),
+        @"reasonNoGPU": [vadContext reasonNoMetal],
+    });
+}
+
+RCT_REMAP_METHOD(vadDetectSpeech,
+                 withContextId:(int)contextId
+                 withAudioData:(NSString *)audioDataBase64
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Decode base64 audio data
+    NSData *audioData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+    if (audioData == nil) {
+        reject(@"whisper_vad_error", @"Invalid audio data", nil);
+        return;
+    }
+
+    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(vadDetectSpeechFile,
+                 withVadContextId:(int)contextId
+                 withFilePath:(NSString *)filePath
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Handle different input types like transcribeFile does
+    float *data = nil;
+    int count = 0;
+    if ([filePath hasPrefix:@"http://"] || [filePath hasPrefix:@"https://"]) {
+        NSString *path = [RNWhisperDownloader downloadFile:filePath toFile:nil];
+        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
+    } else if ([filePath hasPrefix:@"data:audio/wav;base64,"]) {
+        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[filePath substringFromIndex:22] options:0];
+        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
+    } else {
+        data = [RNWhisperAudioUtils decodeWaveFile:filePath count:&count];
+    }
+
+    if (data == nil) {
+        reject(@"whisper_vad_error", @"Failed to load or decode audio file", nil);
+        return;
+    }
+
+    // Convert float32 data to NSData for VAD context
+    NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];
+
+    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(releaseVadContext,
+                 withVadContextId:(int)contextId
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+    [vadContext invalidate];
+    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    resolve(nil);
+}
+
+RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
+                  withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+    }
+    resolve(nil);
+}
+
 #ifdef RCT_NEW_ARCH_ENABLED
 - (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
     (const facebook::react::ObjCTurboModule::InitParams &)params
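
Note that vadDetectSpeech takes base64-encoded raw audio rather than a file path: the string is decoded with initWithBase64EncodedString: and the bytes are later read as const float * by the VAD context (see RNWhisperVadContext.mm below), so the payload must be raw 32-bit float PCM, mono at 16 kHz as whisper.cpp expects. A sketch of building such a payload on the JS side, assuming a byte-array base64 helper such as base64-js:

import { fromByteArray } from 'base64-js'; // assumed helper; any Uint8Array-to-base64 encoder works

// Reinterpret float32 samples as bytes and base64-encode them;
// the native side reverses this and reads the buffer as const float *.
function encodeSamplesBase64(samples: Float32Array): string {
  const bytes = new Uint8Array(samples.buffer, samples.byteOffset, samples.byteLength);
  return fromByteArray(bytes);
}

Also worth noting: per the releaseAllContexts hunk above, releasing all transcription contexts now invalidates any live VAD contexts as well, so a blanket cleanup on teardown covers both.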
package/ios/RNWhisperContext.mm CHANGED
@@ -36,36 +36,30 @@
         NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
         cparams.use_gpu = false;
     }
+    reasonNoMetal = @"Metal is not enabled in this build";
 #endif
 
 #ifdef WSP_GGML_USE_METAL
     if (cparams.use_gpu) {
-#if TARGET_OS_SIMULATOR
-        NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
-        cparams.use_gpu = false;
-#else // TARGET_OS_SIMULATOR
-        // Check ggml-metal availability
-        NSError * error = nil;
         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-        id<MTLLibrary> library = [device
-            newLibraryWithSource:@"#include <metal_stdlib>\n"
-                "using namespace metal;"
-                "kernel void test() { simd_sum(0); }"
-            options:nil
-            error:&error
-        ];
-        if (error) {
-            reasonNoMetal = [error localizedDescription];
-        } else {
-            id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
-            id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
-            if (pipeline == nil) {
-                reasonNoMetal = [error localizedDescription];
-                NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
-                cparams.use_gpu = false;
-            }
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
         }
-#endif // TARGET_OS_SIMULATOR
+        if (!supportsGgmlMetal) {
+            cparams.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        cparams.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
     }
 #endif // WSP_GGML_USE_METAL
 
package/ios/RNWhisperVadContext.h ADDED
@@ -0,0 +1,29 @@
+#ifdef __cplusplus
+#if RNWHISPER_BUILD_FROM_SOURCE
+#import "whisper.h"
+#import "rn-whisper.h"
+#else
+#import <rnwhisper/whisper.h>
+#import <rnwhisper/rn-whisper.h>
+#endif
+#endif
+
+#import <Foundation/Foundation.h>
+
+@interface RNWhisperVadContext : NSObject {
+    int contextId;
+    dispatch_queue_t dQueue;
+    struct whisper_vad_context * vctx;
+    NSString * reasonNoMetal;
+    bool isMetalEnabled;
+}
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads;
+- (bool)isMetalEnabled;
+- (NSString *)reasonNoMetal;
+- (struct whisper_vad_context *)getVadContext;
+- (dispatch_queue_t)getDispatchQueue;
+- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;
+- (void)invalidate;
+
+@end
package/ios/RNWhisperVadContext.mm ADDED
@@ -0,0 +1,152 @@
+#import "RNWhisperVadContext.h"
+#import "RNWhisperAudioUtils.h"
+#import <Metal/Metal.h>
+
+@implementation RNWhisperVadContext
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads {
+    RNWhisperVadContext *context = [[RNWhisperVadContext alloc] init];
+
+    context->contextId = contextId;
+    context->dQueue = dispatch_queue_create("rnwhisper.vad.serial_queue", DISPATCH_QUEUE_SERIAL);
+    NSString *reasonNoMetal = @"";
+
+    // Set up VAD context parameters
+    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+    ctx_params.use_gpu = !noMetal;
+    if (nThreads != nil) {
+        ctx_params.n_threads = [nThreads intValue];
+    }
+
+#ifdef WSP_GGML_USE_METAL
+    if (ctx_params.use_gpu) {
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+        }
+        if (!supportsGgmlMetal) {
+            ctx_params.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        ctx_params.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
+    }
+#endif // WSP_GGML_USE_METAL
+
+    // Initialize VAD context
+    context->vctx = whisper_vad_init_from_file_with_params([modelPath UTF8String], ctx_params);
+
+    if (context->vctx == NULL) {
+        NSLog(@"Failed to initialize VAD context from model: %@", modelPath);
+        return nil;
+    }
+
+    // Check GPU status
+    context->isMetalEnabled = ctx_params.use_gpu;
+    context->reasonNoMetal = reasonNoMetal;
+
+    return context;
+}
+
+- (bool)isMetalEnabled {
+    return isMetalEnabled;
+}
+
+- (NSString *)reasonNoMetal {
+    return reasonNoMetal;
+}
+
+- (struct whisper_vad_context *)getVadContext {
+    return vctx;
+}
+
+- (dispatch_queue_t)getDispatchQueue {
+    return dQueue;
+}
+
+- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options {
+    if (vctx == NULL) {
+        NSLog(@"VAD context is null");
+        return @[];
+    }
+
+    // Convert NSData to float array
+    const float *samples = (const float *)[audioData bytes];
+    int n_samples = (int)[audioData length] / sizeof(float);
+
+    // Run VAD detection
+    bool speechDetected = whisper_vad_detect_speech(vctx, samples, n_samples);
+    if (!speechDetected) {
+        return @[];
+    }
+
+    // Get VAD parameters
+    struct whisper_vad_params vad_params = whisper_vad_default_params();
+
+    if ([options objectForKey:@"threshold"]) {
+        vad_params.threshold = [[options objectForKey:@"threshold"] floatValue];
+    }
+    if ([options objectForKey:@"minSpeechDurationMs"]) {
+        vad_params.min_speech_duration_ms = [[options objectForKey:@"minSpeechDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"minSilenceDurationMs"]) {
+        vad_params.min_silence_duration_ms = [[options objectForKey:@"minSilenceDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"maxSpeechDurationS"]) {
+        vad_params.max_speech_duration_s = [[options objectForKey:@"maxSpeechDurationS"] floatValue];
+    }
+    if ([options objectForKey:@"speechPadMs"]) {
+        vad_params.speech_pad_ms = [[options objectForKey:@"speechPadMs"] intValue];
+    }
+    if ([options objectForKey:@"samplesOverlap"]) {
+        vad_params.samples_overlap = [[options objectForKey:@"samplesOverlap"] floatValue];
+    }
+
+    // Get segments from VAD probabilities
+    struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, vad_params);
+    if (segments == NULL) {
+        return @[];
+    }
+
+    // Convert segments to NSArray
+    NSMutableArray *result = [[NSMutableArray alloc] init];
+    int n_segments = whisper_vad_segments_n_segments(segments);
+
+    for (int i = 0; i < n_segments; i++) {
+        float t0 = whisper_vad_segments_get_segment_t0(segments, i);
+        float t1 = whisper_vad_segments_get_segment_t1(segments, i);
+
+        NSDictionary *segment = @{
+            @"t0": @(t0),
+            @"t1": @(t1)
+        };
+        [result addObject:segment];
+    }
+
+    // Clean up
+    whisper_vad_free_segments(segments);
+
+    return result;
+}
+
+- (void)invalidate {
+    if (vctx != NULL) {
+        whisper_vad_free(vctx);
+        vctx = NULL;
+    }
+}
+
+- (void)dealloc {
+    [self invalidate];
+}
+
+@end
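
detectSpeech: returns the segments as an array of { t0, t1 } dictionaries, and an empty array when whisper_vad_detect_speech finds no speech, so on the JS side the resolved result can double as a cheap gate before running a full transcription. A minimal sketch (the t0/t1 units are whatever whisper.cpp's VAD reports; nothing here depends on them):

import { NativeModules } from 'react-native';

const { RNWhisper } = NativeModules;

// True when the VAD reports at least one speech segment in the file.
async function hasSpeech(vadContextId: number, wavPath: string): Promise<boolean> {
  const segments: Array<{ t0: number; t1: number }> =
    await RNWhisper.vadDetectSpeechFile(vadContextId, wavPath, {});
  return segments.length > 0;
}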