whisper.rn 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -0
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +211 -0
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +34 -4
- package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +157 -0
- package/android/src/main/jni.cpp +196 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
- package/ios/RNWhisper.mm +147 -0
- package/ios/RNWhisperContext.mm +18 -24
- package/ios/RNWhisperVadContext.h +29 -0
- package/ios/RNWhisperVadContext.mm +152 -0
- package/jest/mock.js +19 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +111 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +112 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +35 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +39 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +48 -0
- package/src/index.ts +132 -1
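The headline addition in 0.4.1 is voice activity detection (VAD): new native VAD context APIs on Android (JNI functions plus RNWhisper module methods) and iOS (RNWhisperVadContext), with matching TypeScript typings. As a rough orientation before the per-file diffs, here is a hypothetical TypeScript sketch of driving the new native methods directly; the real public wrappers live in package/src/index.ts (not shown in this section), so only the native method names and option keys that appear in the diffs below are taken as given.

```ts
import { NativeModules } from 'react-native'

// Assumption: the native module is reachable as NativeModules.RNWhisper;
// the published library wraps this behind higher-level helpers in src/index.ts.
const { RNWhisper } = NativeModules

async function detectSpeechSegments(vadModelPath: string, wavPath: string) {
  // initVadContext resolves with { contextId, gpu, reasonNoGPU } (see RNWhisper.mm below).
  const { contextId } = await RNWhisper.initVadContext({ filePath: vadModelPath, useGpu: true })
  try {
    // Segments are plain { t0, t1 } objects (see RNWhisperVadContext.mm below).
    const segments: Array<{ t0: number; t1: number }> = await RNWhisper.vadDetectSpeechFile(
      contextId,
      wavPath,
      { threshold: 0.5 } // illustrative VAD option; the full key list appears further down
    )
    return segments
  } finally {
    await RNWhisper.releaseVadContext(contextId)
  }
}
```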
package/android/src/main/jni.cpp
CHANGED
@@ -148,6 +148,47 @@ static struct whisper_context *whisper_init_from_asset(
     return whisper_init_with_params(&loader, cparams);
 }
 
+// VAD context initialization functions
+static struct whisper_vad_context *whisper_vad_init_from_input_stream(
+    JNIEnv *env,
+    jobject input_stream, // PushbackInputStream
+    struct whisper_vad_context_params vad_params
+) {
+    input_stream_context *context = new input_stream_context;
+    context->env = env;
+    context->input_stream = env->NewGlobalRef(input_stream);
+
+    whisper_model_loader loader = {
+        .context = context,
+        .read = &input_stream_read,
+        .eof = &input_stream_is_eof,
+        .close = &input_stream_close
+    };
+    return whisper_vad_init_with_params(&loader, vad_params);
+}
+
+static struct whisper_vad_context *whisper_vad_init_from_asset(
+    JNIEnv *env,
+    jobject assetManager,
+    const char *asset_path,
+    struct whisper_vad_context_params vad_params
+) {
+    LOGI("Loading VAD model from asset '%s'\n", asset_path);
+    AAssetManager *asset_manager = AAssetManager_fromJava(env, assetManager);
+    AAsset *asset = AAssetManager_open(asset_manager, asset_path, AASSET_MODE_STREAMING);
+    if (!asset) {
+        LOGW("Failed to open VAD asset '%s'\n", asset_path);
+        return NULL;
+    }
+    whisper_model_loader loader = {
+        .context = asset,
+        .read = &asset_read,
+        .eof = &asset_is_eof,
+        .close = &asset_close
+    };
+    return whisper_vad_init_with_params(&loader, vad_params);
+}
+
 extern "C" {
 
 JNIEXPORT jlong JNICALL

@@ -530,4 +571,159 @@ Java_com_rnwhisper_WhisperContext_bench(
     return env->NewStringUTF(result.c_str());
 }
 
+// VAD Context JNI implementations
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContext(
+    JNIEnv *env,
+    jobject thiz,
+    jstring model_path_str
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
+    vad_context = whisper_vad_init_from_file_with_params(model_path_chars, vad_params);
+    env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContextWithAsset(
+    JNIEnv *env,
+    jobject thiz,
+    jobject asset_manager,
+    jstring model_path_str
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    const char *model_path_chars = env->GetStringUTFChars(model_path_str, nullptr);
+    vad_context = whisper_vad_init_from_asset(env, asset_manager, model_path_chars, vad_params);
+    env->ReleaseStringUTFChars(model_path_str, model_path_chars);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_initVadContextWithInputStream(
+    JNIEnv *env,
+    jobject thiz,
+    jobject input_stream
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context_params vad_params = whisper_vad_default_context_params();
+
+    struct whisper_vad_context *vad_context = nullptr;
+    vad_context = whisper_vad_init_from_input_stream(env, input_stream, vad_params);
+    return reinterpret_cast<jlong>(vad_context);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_freeVadContext(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+    whisper_vad_free(vad_context);
+}
+
+JNIEXPORT jboolean JNICALL
+Java_com_rnwhisper_WhisperContext_vadDetectSpeech(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr,
+    jfloatArray audio_data,
+    jint n_samples
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+
+    jfloat *audio_data_arr = env->GetFloatArrayElements(audio_data, nullptr);
+    bool result = whisper_vad_detect_speech(vad_context, audio_data_arr, n_samples);
+    env->ReleaseFloatArrayElements(audio_data, audio_data_arr, JNI_ABORT);
+
+    return result;
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentsFromProbs(
+    JNIEnv *env,
+    jobject thiz,
+    jlong vad_context_ptr,
+    jfloat threshold,
+    jint min_speech_duration_ms,
+    jint min_silence_duration_ms,
+    jfloat max_speech_duration_s,
+    jint speech_pad_ms,
+    jfloat samples_overlap
+) {
+    UNUSED(thiz);
+    struct whisper_vad_context *vad_context = reinterpret_cast<struct whisper_vad_context *>(vad_context_ptr);
+
+    struct whisper_vad_params vad_params = whisper_vad_default_params();
+    vad_params.threshold = threshold;
+    vad_params.min_speech_duration_ms = min_speech_duration_ms;
+    vad_params.min_silence_duration_ms = min_silence_duration_ms;
+    vad_params.max_speech_duration_s = max_speech_duration_s;
+    vad_params.speech_pad_ms = speech_pad_ms;
+    vad_params.samples_overlap = samples_overlap;
+
+    struct whisper_vad_segments *segments = whisper_vad_segments_from_probs(vad_context, vad_params);
+    return reinterpret_cast<jlong>(segments);
+}
+
+JNIEXPORT jint JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetNSegments(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_n_segments(segments);
+}
+
+JNIEXPORT jfloat JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentT0(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr,
+    jint index
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_get_segment_t0(segments, index);
+}
+
+JNIEXPORT jfloat JNICALL
+Java_com_rnwhisper_WhisperContext_vadGetSegmentT1(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr,
+    jint index
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    return whisper_vad_segments_get_segment_t1(segments, index);
+}
+
+JNIEXPORT void JNICALL
+Java_com_rnwhisper_WhisperContext_vadFreeSegments(
+    JNIEnv *env,
+    jobject thiz,
+    jlong segments_ptr
+) {
+    UNUSED(env);
+    UNUSED(thiz);
+    struct whisper_vad_segments *segments = reinterpret_cast<struct whisper_vad_segments *>(segments_ptr);
+    whisper_vad_free_segments(segments);
+}
+
 } // extern "C"
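At the JNI level, vadDetectSpeech takes a raw float array of samples, while the React methods added further down pass the audio as a base64 string; on iOS the same base64 payload is decoded and reinterpreted directly as 32-bit float samples. Assuming the JS layer therefore sends base64-encoded raw float32 PCM (an inference from the two ends of the bridge shown in this diff, not something the diff states outright), a payload could be produced like this:

```ts
// Sketch: turn Float32Array samples (mono PCM, 16 kHz per whisper.cpp convention;
// the sample rate is an assumption, not stated in this diff) into the base64 payload
// for vadDetectSpeech. 'buffer' is the npm Buffer polyfill commonly used in RN apps.
import { Buffer } from 'buffer'

export function floatSamplesToBase64(samples: Float32Array): string {
  return Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength).toString('base64')
}
```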

package/android/src/main/jniLibs/** (librnwhisper prebuilt .so libraries)
CHANGED
Binary files changed (6 files; see the jniLibs entries in the file list above, contents not shown).

package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED

@@ -77,6 +77,32 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.releaseAllContexts(promise);
   }
 
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
+
   /*
    * iOS Specific methods, left here for make the turbo module happy:
    */

package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java
CHANGED

@@ -76,4 +76,30 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
   public void releaseAllContexts(Promise promise) {
     rnwhisper.releaseAllContexts(promise);
   }
+
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
 }
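Both architectures forward to the same RNWhisper implementation, so the turbo module spec in src/NativeRNWhisper.ts (+48 lines in this release, not shown here) presumably declares matching methods. A plausible reconstruction from the Java signatures above, offered as a sketch rather than the actual file content:

```ts
import type { TurboModule } from 'react-native'

// Hypothetical VAD-related additions to the spec; parameter names and types mirror the
// @ReactMethod signatures above (double id, base64 string, options map); return types
// are assumptions.
export interface SpecVadAdditions extends TurboModule {
  initVadContext(options: Object): Promise<Object>
  vadDetectSpeech(id: number, audioDataBase64: string, options: Object): Promise<Object>
  vadDetectSpeechFile(id: number, filePath: string, options: Object): Promise<Object>
  releaseVadContext(id: number): Promise<void>
  releaseAllVadContexts(): Promise<void>
}
```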
package/ios/RNWhisper.mm
CHANGED
@@ -1,5 +1,6 @@
 #import "RNWhisper.h"
 #import "RNWhisperContext.h"
+#import "RNWhisperVadContext.h"
 #import "RNWhisperDownloader.h"
 #import "RNWhisperAudioUtils.h"
 #import "RNWhisperAudioSessionUtils.h"

@@ -13,6 +14,7 @@
 @implementation RNWhisper
 
 NSMutableDictionary *contexts;
+NSMutableDictionary *vadContexts;
 
 RCT_EXPORT_MODULE()
 

@@ -366,6 +368,15 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
 
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+        vadContexts = nil;
+    }
+
     rnwhisper::job_abort_all(); // graceful abort
 
     [contexts removeAllObjects];

@@ -437,6 +448,142 @@ RCT_REMAP_METHOD(setAudioSessionActive,
     resolve(nil);
 }
 
+RCT_REMAP_METHOD(initVadContext,
+                 withVadOptions:(NSDictionary *)vadOptions
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts == nil) {
+        vadContexts = [[NSMutableDictionary alloc] init];
+    }
+
+    NSString *modelPath = [vadOptions objectForKey:@"filePath"];
+    BOOL isBundleAsset = [[vadOptions objectForKey:@"isBundleAsset"] boolValue];
+    BOOL useGpu = [[vadOptions objectForKey:@"useGpu"] boolValue];
+    NSNumber *nThreads = [vadOptions objectForKey:@"nThreads"];
+
+    NSString *path = modelPath;
+    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
+        path = [RNWhisperDownloader downloadFile:path toFile:nil];
+    }
+    if (isBundleAsset) {
+        path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
+    }
+
+    int contextId = arc4random_uniform(1000000);
+
+    RNWhisperVadContext *vadContext = [RNWhisperVadContext
+        initWithModelPath:path
+        contextId:contextId
+        noMetal:!useGpu
+        nThreads:nThreads
+    ];
+    if ([vadContext getVadContext] == NULL) {
+        reject(@"whisper_vad_error", @"Failed to load the VAD model", nil);
+        return;
+    }
+
+    [vadContexts setObject:vadContext forKey:[NSNumber numberWithInt:contextId]];
+
+    resolve(@{
+        @"contextId": @(contextId),
+        @"gpu": @([vadContext isMetalEnabled]),
+        @"reasonNoGPU": [vadContext reasonNoMetal],
+    });
+}
+
+RCT_REMAP_METHOD(vadDetectSpeech,
+                 withContextId:(int)contextId
+                 withAudioData:(NSString *)audioDataBase64
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Decode base64 audio data
+    NSData *audioData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+    if (audioData == nil) {
+        reject(@"whisper_vad_error", @"Invalid audio data", nil);
+        return;
+    }
+
+    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(vadDetectSpeechFile,
+                 withVadContextId:(int)contextId
+                 withFilePath:(NSString *)filePath
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Handle different input types like transcribeFile does
+    float *data = nil;
+    int count = 0;
+    if ([filePath hasPrefix:@"http://"] || [filePath hasPrefix:@"https://"]) {
+        NSString *path = [RNWhisperDownloader downloadFile:filePath toFile:nil];
+        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
+    } else if ([filePath hasPrefix:@"data:audio/wav;base64,"]) {
+        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[filePath substringFromIndex:22] options:0];
+        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
+    } else {
+        data = [RNWhisperAudioUtils decodeWaveFile:filePath count:&count];
+    }
+
+    if (data == nil) {
+        reject(@"whisper_vad_error", @"Failed to load or decode audio file", nil);
+        return;
+    }
+
+    // Convert float32 data to NSData for VAD context
+    NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];
+
+    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(releaseVadContext,
+                 withVadContextId:(int)contextId
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+    [vadContext invalidate];
+    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    resolve(nil);
+}
+
+RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
+                  withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+    }
+    resolve(nil);
+}
+
 #ifdef RCT_NEW_ARCH_ENABLED
 - (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
     (const facebook::react::ObjCTurboModule::InitParams &)params
package/ios/RNWhisperContext.mm
CHANGED
@@ -36,36 +36,30 @@
         NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
         cparams.use_gpu = false;
     }
+    reasonNoMetal = @"Metal is not enabled in this build";
 #endif
 
 #ifdef WSP_GGML_USE_METAL
     if (cparams.use_gpu) {
-#if TARGET_OS_SIMULATOR
-        NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
-        cparams.use_gpu = false;
-#else // TARGET_OS_SIMULATOR
-        // Check ggml-metal availability
-        NSError * error = nil;
         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-
-
-
-
-
-            error:&error
-        ];
-        if (error) {
-            reasonNoMetal = [error localizedDescription];
-        } else {
-            id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
-            id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
-            if (pipeline == nil) {
-                reasonNoMetal = [error localizedDescription];
-                NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
-                cparams.use_gpu = false;
-            }
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
         }
-
+        if (!supportsGgmlMetal) {
+            cparams.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        cparams.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
     }
 #endif // WSP_GGML_USE_METAL
 

package/ios/RNWhisperVadContext.h
ADDED

@@ -0,0 +1,29 @@
+#ifdef __cplusplus
+#if RNWHISPER_BUILD_FROM_SOURCE
+#import "whisper.h"
+#import "rn-whisper.h"
+#else
+#import <rnwhisper/whisper.h>
+#import <rnwhisper/rn-whisper.h>
+#endif
+#endif
+
+#import <Foundation/Foundation.h>
+
+@interface RNWhisperVadContext : NSObject {
+    int contextId;
+    dispatch_queue_t dQueue;
+    struct whisper_vad_context * vctx;
+    NSString * reasonNoMetal;
+    bool isMetalEnabled;
+}
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads;
+- (bool)isMetalEnabled;
+- (NSString *)reasonNoMetal;
+- (struct whisper_vad_context *)getVadContext;
+- (dispatch_queue_t)getDispatchQueue;
+- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;
+- (void)invalidate;
+
+@end

package/ios/RNWhisperVadContext.mm
ADDED

@@ -0,0 +1,152 @@
+#import "RNWhisperVadContext.h"
+#import "RNWhisperAudioUtils.h"
+#import <Metal/Metal.h>
+
+@implementation RNWhisperVadContext
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads {
+    RNWhisperVadContext *context = [[RNWhisperVadContext alloc] init];
+
+    context->contextId = contextId;
+    context->dQueue = dispatch_queue_create("rnwhisper.vad.serial_queue", DISPATCH_QUEUE_SERIAL);
+    NSString *reasonNoMetal = @"";
+
+    // Set up VAD context parameters
+    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+    ctx_params.use_gpu = !noMetal;
+    if (nThreads != nil) {
+        ctx_params.n_threads = [nThreads intValue];
+    }
+
+#ifdef WSP_GGML_USE_METAL
+    if (ctx_params.use_gpu) {
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+        }
+        if (!supportsGgmlMetal) {
+            ctx_params.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        ctx_params.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
+    }
+#endif // WSP_GGML_USE_METAL
+
+    // Initialize VAD context
+    context->vctx = whisper_vad_init_from_file_with_params([modelPath UTF8String], ctx_params);
+
+    if (context->vctx == NULL) {
+        NSLog(@"Failed to initialize VAD context from model: %@", modelPath);
+        return nil;
+    }
+
+    // Check GPU status
+    context->isMetalEnabled = ctx_params.use_gpu;
+    context->reasonNoMetal = reasonNoMetal;
+
+    return context;
+}
+
+- (bool)isMetalEnabled {
+    return isMetalEnabled;
+}
+
+- (NSString *)reasonNoMetal {
+    return reasonNoMetal;
+}
+
+- (struct whisper_vad_context *)getVadContext {
+    return vctx;
+}
+
+- (dispatch_queue_t)getDispatchQueue {
+    return dQueue;
+}
+
+- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options {
+    if (vctx == NULL) {
+        NSLog(@"VAD context is null");
+        return @[];
+    }
+
+    // Convert NSData to float array
+    const float *samples = (const float *)[audioData bytes];
+    int n_samples = (int)[audioData length] / sizeof(float);
+
+    // Run VAD detection
+    bool speechDetected = whisper_vad_detect_speech(vctx, samples, n_samples);
+    if (!speechDetected) {
+        return @[];
+    }
+
+    // Get VAD parameters
+    struct whisper_vad_params vad_params = whisper_vad_default_params();
+
+    if ([options objectForKey:@"threshold"]) {
+        vad_params.threshold = [[options objectForKey:@"threshold"] floatValue];
+    }
+    if ([options objectForKey:@"minSpeechDurationMs"]) {
+        vad_params.min_speech_duration_ms = [[options objectForKey:@"minSpeechDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"minSilenceDurationMs"]) {
+        vad_params.min_silence_duration_ms = [[options objectForKey:@"minSilenceDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"maxSpeechDurationS"]) {
+        vad_params.max_speech_duration_s = [[options objectForKey:@"maxSpeechDurationS"] floatValue];
+    }
+    if ([options objectForKey:@"speechPadMs"]) {
+        vad_params.speech_pad_ms = [[options objectForKey:@"speechPadMs"] intValue];
+    }
+    if ([options objectForKey:@"samplesOverlap"]) {
+        vad_params.samples_overlap = [[options objectForKey:@"samplesOverlap"] floatValue];
+    }
+
+    // Get segments from VAD probabilities
+    struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, vad_params);
+    if (segments == NULL) {
+        return @[];
+    }
+
+    // Convert segments to NSArray
+    NSMutableArray *result = [[NSMutableArray alloc] init];
+    int n_segments = whisper_vad_segments_n_segments(segments);
+
+    for (int i = 0; i < n_segments; i++) {
+        float t0 = whisper_vad_segments_get_segment_t0(segments, i);
+        float t1 = whisper_vad_segments_get_segment_t1(segments, i);
+
+        NSDictionary *segment = @{
+            @"t0": @(t0),
+            @"t1": @(t1)
+        };
+        [result addObject:segment];
+    }
+
+    // Clean up
+    whisper_vad_free_segments(segments);
+
+    return result;
+}
+
+- (void)invalidate {
+    if (vctx != NULL) {
+        whisper_vad_free(vctx);
+        vctx = NULL;
+    }
+}
+
+- (void)dealloc {
+    [self invalidate];
+}
+
+@end
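The option keys parsed in detectSpeech: above, and the { t0, t1 } segment dictionaries it returns, map directly onto TypeScript shapes. The names below simply mirror the Objective-C keys; the typings actually shipped in lib/typescript may differ.

```ts
// Mirror of the option keys read from the options dictionary above.
export interface VadOptions {
  threshold?: number             // vad_params.threshold
  minSpeechDurationMs?: number   // vad_params.min_speech_duration_ms
  minSilenceDurationMs?: number  // vad_params.min_silence_duration_ms
  maxSpeechDurationS?: number    // vad_params.max_speech_duration_s
  speechPadMs?: number           // vad_params.speech_pad_ms
  samplesOverlap?: number        // vad_params.samples_overlap
}

// Shape of each segment object built from whisper_vad_segments_get_segment_t0/t1.
export interface VadSegment {
  t0: number
  t1: number
}
```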