whisper.rn 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -0
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +212 -0
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +34 -4
- package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +101 -0
- package/android/src/main/jni.cpp +196 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
- package/ios/RNWhisper.mm +147 -0
- package/ios/RNWhisperContext.mm +18 -24
- package/ios/RNWhisperVadContext.h +29 -0
- package/ios/RNWhisperVadContext.mm +148 -0
- package/jest/mock.js +19 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +111 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +112 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +35 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +39 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +48 -0
- package/src/index.ts +132 -1
package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java CHANGED

```diff
@@ -77,6 +77,32 @@ public class RNWhisperModule extends NativeRNWhisperSpec {
     rnwhisper.releaseAllContexts(promise);
   }
 
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
+
   /*
    * iOS Specific methods, left here for make the turbo module happy:
    */
```
package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java CHANGED

```diff
@@ -76,4 +76,30 @@ public class RNWhisperModule extends ReactContextBaseJavaModule {
   public void releaseAllContexts(Promise promise) {
     rnwhisper.releaseAllContexts(promise);
   }
+
+  // VAD methods
+  @ReactMethod
+  public void initVadContext(final ReadableMap options, final Promise promise) {
+    rnwhisper.initVadContext(options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeech(double id, String audioDataBase64, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeech(id, audioDataBase64, options, promise);
+  }
+
+  @ReactMethod
+  public void vadDetectSpeechFile(double id, String filePath, ReadableMap options, Promise promise) {
+    rnwhisper.vadDetectSpeechFile(id, filePath, options, promise);
+  }
+
+  @ReactMethod
+  public void releaseVadContext(double id, Promise promise) {
+    rnwhisper.releaseVadContext(id, promise);
+  }
+
+  @ReactMethod
+  public void releaseAllVadContexts(Promise promise) {
+    rnwhisper.releaseAllVadContexts(promise);
+  }
 }
```
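Both the new-architecture and old-architecture Android modules simply forward these calls to the shared `com.rnwhisper.RNWhisper` implementation. The matching TurboModule spec lives in `package/src/NativeRNWhisper.ts` (+48 lines, not expanded in this section); as orientation only, its VAD portion presumably looks something like the sketch below, with shapes inferred from the Java and Objective-C signatures rather than copied from the package:

```ts
// Hypothetical sketch of the VAD additions to the NativeRNWhisper spec (not verbatim from the package).
export type VadSegment = { t0: number; t1: number }

// Per-call option keys (threshold, minSpeechDurationMs, ...) are enumerated with
// RNWhisperVadContext.mm further below; a plain numeric-valued object is assumed here.
export type VadOptions = { [key: string]: number }

export interface VadSpecMethods {
  initVadContext(options: {
    filePath?: string
    isBundleAsset?: boolean
    useGpu?: boolean
    nThreads?: number
  }): Promise<{ contextId: number; gpu: boolean; reasonNoGPU: string }>
  vadDetectSpeech(contextId: number, audioDataBase64: string, options: VadOptions): Promise<VadSegment[]>
  vadDetectSpeechFile(contextId: number, filePath: string, options: VadOptions): Promise<VadSegment[]>
  releaseVadContext(contextId: number): Promise<void>
  releaseAllVadContexts(): Promise<void>
}
```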
package/ios/RNWhisper.mm CHANGED

```diff
@@ -1,5 +1,6 @@
 #import "RNWhisper.h"
 #import "RNWhisperContext.h"
+#import "RNWhisperVadContext.h"
 #import "RNWhisperDownloader.h"
 #import "RNWhisperAudioUtils.h"
 #import "RNWhisperAudioSessionUtils.h"
@@ -13,6 +14,7 @@
 @implementation RNWhisper
 
 NSMutableDictionary *contexts;
+NSMutableDictionary *vadContexts;
 
 RCT_EXPORT_MODULE()
 
@@ -366,6 +368,15 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
 
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+        vadContexts = nil;
+    }
+
     rnwhisper::job_abort_all(); // graceful abort
 
     [contexts removeAllObjects];
@@ -437,6 +448,142 @@ RCT_REMAP_METHOD(setAudioSessionActive,
     resolve(nil);
 }
 
+RCT_REMAP_METHOD(initVadContext,
+                 withVadOptions:(NSDictionary *)vadOptions
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts == nil) {
+        vadContexts = [[NSMutableDictionary alloc] init];
+    }
+
+    NSString *modelPath = [vadOptions objectForKey:@"filePath"];
+    BOOL isBundleAsset = [[vadOptions objectForKey:@"isBundleAsset"] boolValue];
+    BOOL useGpu = [[vadOptions objectForKey:@"useGpu"] boolValue];
+    NSNumber *nThreads = [vadOptions objectForKey:@"nThreads"];
+
+    NSString *path = modelPath;
+    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
+        path = [RNWhisperDownloader downloadFile:path toFile:nil];
+    }
+    if (isBundleAsset) {
+        path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
+    }
+
+    int contextId = arc4random_uniform(1000000);
+
+    RNWhisperVadContext *vadContext = [RNWhisperVadContext
+        initWithModelPath:path
+        contextId:contextId
+        noMetal:!useGpu
+        nThreads:nThreads
+    ];
+    if ([vadContext getVadContext] == NULL) {
+        reject(@"whisper_vad_error", @"Failed to load the VAD model", nil);
+        return;
+    }
+
+    [vadContexts setObject:vadContext forKey:[NSNumber numberWithInt:contextId]];
+
+    resolve(@{
+        @"contextId": @(contextId),
+        @"gpu": @([vadContext isMetalEnabled]),
+        @"reasonNoGPU": [vadContext reasonNoMetal],
+    });
+}
+
+RCT_REMAP_METHOD(vadDetectSpeech,
+                 withContextId:(int)contextId
+                 withAudioData:(NSString *)audioDataBase64
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Decode base64 audio data
+    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+    if (pcmData == nil) {
+        reject(@"whisper_vad_error", @"Invalid audio data", nil);
+        return;
+    }
+
+    int count = 0;
+    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(vadDetectSpeechFile,
+                 withVadContextId:(int)contextId
+                 withFilePath:(NSString *)filePath
+                 withOptions:(NSDictionary *)options
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+
+    // Handle different input types like transcribeFile does
+    float *data = nil;
+    int count = 0;
+    if ([filePath hasPrefix:@"http://"] || [filePath hasPrefix:@"https://"]) {
+        NSString *path = [RNWhisperDownloader downloadFile:filePath toFile:nil];
+        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
+    } else if ([filePath hasPrefix:@"data:audio/wav;base64,"]) {
+        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[filePath substringFromIndex:22] options:0];
+        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
+    } else {
+        data = [RNWhisperAudioUtils decodeWaveFile:filePath count:&count];
+    }
+
+    if (data == nil) {
+        reject(@"whisper_vad_error", @"Failed to load or decode audio file", nil);
+        return;
+    }
+
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
+    resolve(segments);
+}
+
+RCT_REMAP_METHOD(releaseVadContext,
+                 withVadContextId:(int)contextId
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperVadContext *vadContext = vadContexts[[NSNumber numberWithInt:contextId]];
+    if (vadContext == nil) {
+        reject(@"whisper_vad_error", @"VAD context not found", nil);
+        return;
+    }
+    [vadContext invalidate];
+    [vadContexts removeObjectForKey:[NSNumber numberWithInt:contextId]];
+    resolve(nil);
+}
+
+RCT_EXPORT_METHOD(releaseAllVadContexts:(RCTPromiseResolveBlock)resolve
+                  withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    if (vadContexts != nil) {
+        for (NSNumber *contextId in vadContexts) {
+            RNWhisperVadContext *vadContext = vadContexts[contextId];
+            [vadContext invalidate];
+        }
+        [vadContexts removeAllObjects];
+    }
+    resolve(nil);
+}
+
 #ifdef RCT_NEW_ARCH_ENABLED
 - (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
     (const facebook::react::ObjCTurboModule::InitParams &)params
```
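Note that `vadDetectSpeechFile` mirrors `transcribeFile`'s input handling: a plain path is decoded directly, an http(s) URL is downloaded first through `RNWhisperDownloader`, and a `data:audio/wav;base64,` string is decoded in memory. A small illustrative sketch of the three forms (paths are placeholders); as the JS wrapper later in this diff shows, remote URLs are rejected before reaching the native layer, so the URL form only applies to direct native-module calls:

```ts
// Illustrative input forms for vadDetectSpeechFile (placeholder paths, not from the package).
const localWav  = '/path/to/recording.wav'           // decoded via decodeWaveFile
const remoteWav = 'https://example.com/sample.wav'   // downloaded first, then decoded
const inlineWav = 'data:audio/wav;base64,UklGRi...'  // WAV header cut, PCM decoded in memory
```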
package/ios/RNWhisperContext.mm CHANGED

```diff
@@ -36,36 +36,30 @@
         NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
         cparams.use_gpu = false;
     }
+    reasonNoMetal = @"Metal is not enabled in this build";
 #endif
 
 #ifdef WSP_GGML_USE_METAL
     if (cparams.use_gpu) {
-#if TARGET_OS_SIMULATOR
-        NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
-        cparams.use_gpu = false;
-#else // TARGET_OS_SIMULATOR
-        // Check ggml-metal availability
-        NSError * error = nil;
         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-
-
-
-
-
-            error:&error
-        ];
-        if (error) {
-            reasonNoMetal = [error localizedDescription];
-        } else {
-            id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
-            id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
-            if (pipeline == nil) {
-                reasonNoMetal = [error localizedDescription];
-                NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
-                cparams.use_gpu = false;
-            }
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
         }
-
+        if (!supportsGgmlMetal) {
+            cparams.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        cparams.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
     }
 #endif // WSP_GGML_USE_METAL
 
```
package/ios/RNWhisperVadContext.h ADDED

```diff
@@ -0,0 +1,29 @@
+#ifdef __cplusplus
+#if RNWHISPER_BUILD_FROM_SOURCE
+#import "whisper.h"
+#import "rn-whisper.h"
+#else
+#import <rnwhisper/whisper.h>
+#import <rnwhisper/rn-whisper.h>
+#endif
+#endif
+
+#import <Foundation/Foundation.h>
+
+@interface RNWhisperVadContext : NSObject {
+    int contextId;
+    dispatch_queue_t dQueue;
+    struct whisper_vad_context * vctx;
+    NSString * reasonNoMetal;
+    bool isMetalEnabled;
+}
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads;
+- (bool)isMetalEnabled;
+- (NSString *)reasonNoMetal;
+- (struct whisper_vad_context *)getVadContext;
+- (dispatch_queue_t)getDispatchQueue;
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options;
+- (void)invalidate;
+
+@end
```
package/ios/RNWhisperVadContext.mm ADDED

```diff
@@ -0,0 +1,148 @@
+#import "RNWhisperVadContext.h"
+#import "RNWhisperAudioUtils.h"
+#import <Metal/Metal.h>
+
+@implementation RNWhisperVadContext
+
++ (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noMetal:(BOOL)noMetal nThreads:(NSNumber *)nThreads {
+    RNWhisperVadContext *context = [[RNWhisperVadContext alloc] init];
+
+    context->contextId = contextId;
+    context->dQueue = dispatch_queue_create("rnwhisper.vad.serial_queue", DISPATCH_QUEUE_SERIAL);
+    NSString *reasonNoMetal = @"";
+
+    // Set up VAD context parameters
+    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+    ctx_params.use_gpu = !noMetal;
+    if (nThreads != nil) {
+        ctx_params.n_threads = [nThreads intValue];
+    }
+
+#ifdef WSP_GGML_USE_METAL
+    if (ctx_params.use_gpu) {
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+
+        // Check ggml-metal availability
+        BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+        if (@available(iOS 16.0, tvOS 16.0, *)) {
+            supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+        }
+        if (!supportsGgmlMetal) {
+            ctx_params.use_gpu = false;
+            reasonNoMetal = @"Metal is not supported in this device";
+        }
+
+#if TARGET_OS_SIMULATOR
+        // Use the backend, but no layers because not supported fully on simulator
+        ctx_params.use_gpu = false;
+        reasonNoMetal = @"Metal is not supported in simulator";
+#endif
+
+        device = nil;
+    }
+#endif // WSP_GGML_USE_METAL
+
+    // Initialize VAD context
+    context->vctx = whisper_vad_init_from_file_with_params([modelPath UTF8String], ctx_params);
+
+    if (context->vctx == NULL) {
+        NSLog(@"Failed to initialize VAD context from model: %@", modelPath);
+        return nil;
+    }
+
+    // Check GPU status
+    context->isMetalEnabled = ctx_params.use_gpu;
+    context->reasonNoMetal = reasonNoMetal;
+
+    return context;
+}
+
+- (bool)isMetalEnabled {
+    return isMetalEnabled;
+}
+
+- (NSString *)reasonNoMetal {
+    return reasonNoMetal;
+}
+
+- (struct whisper_vad_context *)getVadContext {
+    return vctx;
+}
+
+- (dispatch_queue_t)getDispatchQueue {
+    return dQueue;
+}
+
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options {
+    if (vctx == NULL) {
+        NSLog(@"VAD context is null");
+        return @[];
+    }
+
+    // Run VAD detection
+    bool speechDetected = whisper_vad_detect_speech(vctx, samples, samplesCount);
+    if (!speechDetected) {
+        return @[];
+    }
+
+    // Get VAD parameters
+    struct whisper_vad_params vad_params = whisper_vad_default_params();
+
+    if ([options objectForKey:@"threshold"]) {
+        vad_params.threshold = [[options objectForKey:@"threshold"] floatValue];
+    }
+    if ([options objectForKey:@"minSpeechDurationMs"]) {
+        vad_params.min_speech_duration_ms = [[options objectForKey:@"minSpeechDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"minSilenceDurationMs"]) {
+        vad_params.min_silence_duration_ms = [[options objectForKey:@"minSilenceDurationMs"] intValue];
+    }
+    if ([options objectForKey:@"maxSpeechDurationS"]) {
+        vad_params.max_speech_duration_s = [[options objectForKey:@"maxSpeechDurationS"] floatValue];
+    }
+    if ([options objectForKey:@"speechPadMs"]) {
+        vad_params.speech_pad_ms = [[options objectForKey:@"speechPadMs"] intValue];
+    }
+    if ([options objectForKey:@"samplesOverlap"]) {
+        vad_params.samples_overlap = [[options objectForKey:@"samplesOverlap"] floatValue];
+    }
+
+    // Get segments from VAD probabilities
+    struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, vad_params);
+    if (segments == NULL) {
+        return @[];
+    }
+
+    // Convert segments to NSArray
+    NSMutableArray *result = [[NSMutableArray alloc] init];
+    int n_segments = whisper_vad_segments_n_segments(segments);
+
+    for (int i = 0; i < n_segments; i++) {
+        float t0 = whisper_vad_segments_get_segment_t0(segments, i);
+        float t1 = whisper_vad_segments_get_segment_t1(segments, i);
+
+        NSDictionary *segment = @{
+            @"t0": @(t0),
+            @"t1": @(t1)
+        };
+        [result addObject:segment];
+    }
+
+    // Clean up
+    whisper_vad_free_segments(segments);
+
+    return result;
+}
+
+- (void)invalidate {
+    if (vctx != NULL) {
+        whisper_vad_free(vctx);
+        vctx = NULL;
+    }
+}
+
+- (void)dealloc {
+    [self invalidate];
+}
+
+@end
```
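`detectSpeech:` only overrides a `whisper_vad_params` field when the corresponding key is present in the options dictionary; everything else keeps the values from `whisper_vad_default_params()`. For reference, a sketch of the key-to-field mapping read by the code above (the TypeScript type is illustrative; key names are copied from this file, defaults are not asserted):

```ts
// Per-call VAD options read by RNWhisperVadContext.mm (keys verbatim, types inferred).
type VadDetectOptions = {
  threshold?: number             // -> vad_params.threshold
  minSpeechDurationMs?: number   // -> vad_params.min_speech_duration_ms
  minSilenceDurationMs?: number  // -> vad_params.min_silence_duration_ms
  maxSpeechDurationS?: number    // -> vad_params.max_speech_duration_s
  speechPadMs?: number           // -> vad_params.speech_pad_ms
  samplesOverlap?: number        // -> vad_params.samples_overlap
}
// Each resulting segment is returned to JS as { t0, t1 }, taken from whisper_vad_segments.
```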
package/jest/mock.js CHANGED

```diff
@@ -61,6 +61,25 @@ if (!NativeModules.RNWhisper) {
     releaseContext: jest.fn(() => Promise.resolve()),
     releaseAllContexts: jest.fn(() => Promise.resolve()),
 
+    // VAD methods
+    initVadContext: jest.fn(() => Promise.resolve({
+      contextId: 2,
+      gpu: false,
+      reasonNoGPU: 'Mock VAD context'
+    })),
+    vadDetectSpeech: jest.fn().mockResolvedValue([
+      { t0: 0.5, t1: 2.3 },
+      { t0: 3.1, t1: 5.8 },
+      { t0: 7.2, t1: 9.4 }
+    ]),
+    vadDetectSpeechFile: jest.fn().mockResolvedValue([
+      { t0: 0.5, t1: 2.3 },
+      { t0: 3.1, t1: 5.8 },
+      { t0: 7.2, t1: 9.4 }
+    ]),
+    releaseVadContext: jest.fn(() => Promise.resolve()),
+    releaseAllVadContexts: jest.fn(() => Promise.resolve()),
+
     // iOS AudioSession utils
     getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
       category: 'AVAudioSessionCategoryPlayAndRecord',
```
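With the mock in place, VAD code paths can be unit-tested without the native binaries: both detection methods always resolve the same three fixed segments. A minimal test sketch, assuming the package's jest mock has been loaded in the test setup and using the public API shown further below (model and audio paths are placeholders):

```ts
import { initWhisperVad } from 'whisper.rn'

it('detects speech segments via the mocked native module', async () => {
  const vad = await initWhisperVad({ filePath: '/models/ggml-silero-vad.bin' }) // placeholder path
  const segments = await vad.detectSpeech('/audio/test.wav')                    // placeholder path
  expect(segments).toHaveLength(3)
  expect(segments[0]).toEqual({ t0: 0.5, t1: 2.3 }) // first fixed segment from the mock
  await vad.release()
})
```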
package/lib/commonjs/NativeRNWhisper.js.map CHANGED

```diff
@@ -1 +1 @@
-{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,
+{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0JnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
```
package/lib/commonjs/index.js CHANGED

```diff
@@ -9,10 +9,12 @@ Object.defineProperty(exports, "AudioSessionIos", {
     return _AudioSessionIos.default;
   }
 });
-exports.WhisperContext = void 0;
+exports.WhisperVadContext = exports.WhisperContext = void 0;
 exports.initWhisper = initWhisper;
+exports.initWhisperVad = initWhisperVad;
 exports.libVersion = exports.isUseCoreML = exports.isCoreMLAllowFallback = void 0;
 exports.releaseAllWhisper = releaseAllWhisper;
+exports.releaseAllWhisperVad = releaseAllWhisperVad;
 var _reactNative = require("react-native");
 var _NativeRNWhisper = _interopRequireDefault(require("./NativeRNWhisper"));
 var _AudioSessionIos = _interopRequireDefault(require("./AudioSessionIos"));
@@ -386,4 +388,112 @@ const isUseCoreML = !!useCoreML;
 exports.isUseCoreML = isUseCoreML;
 const isCoreMLAllowFallback = !!coreMLAllowFallback;
 exports.isCoreMLAllowFallback = isCoreMLAllowFallback;
+class WhisperVadContext {
+  gpu = false;
+  reasonNoGPU = '';
+  constructor(_ref3) {
+    let {
+      contextId,
+      gpu,
+      reasonNoGPU
+    } = _ref3;
+    this.id = contextId;
+    this.gpu = gpu;
+    this.reasonNoGPU = reasonNoGPU;
+  }
+
+  /**
+   * Detect speech segments in audio file (path or base64 encoded wav file)
+   * base64: need add `data:audio/wav;base64,` prefix
+   */
+  async detectSpeech(filePathOrBase64) {
+    let options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
+    let path = '';
+    if (typeof filePathOrBase64 === 'number') {
+      try {
+        const source = _reactNative.Image.resolveAssetSource(filePathOrBase64);
+        if (source) path = source.uri;
+      } catch (e) {
+        throw new Error(`Invalid asset: ${filePathOrBase64}`);
+      }
+    } else {
+      if (filePathOrBase64.startsWith('http')) throw new Error('VAD remote file is not supported, please download it first');
+      path = filePathOrBase64;
+    }
+    if (path.startsWith('file://')) path = path.slice(7);
+
+    // Check if this is base64 encoded audio data
+    if (path.startsWith('data:audio/')) {
+      // This is base64 encoded audio data, use the raw data method
+      return _NativeRNWhisper.default.vadDetectSpeech(this.id, path, options);
+    } else {
+      // This is a file path, use the file method
+      return _NativeRNWhisper.default.vadDetectSpeechFile(this.id, path, options);
+    }
+  }
+
+  /**
+   * Detect speech segments in raw audio data (base64 encoded float32 PCM data)
+   */
+  async detectSpeechData(audioData) {
+    let options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
+    return _NativeRNWhisper.default.vadDetectSpeech(this.id, audioData, options);
+  }
+  async release() {
+    return _NativeRNWhisper.default.releaseVadContext(this.id);
+  }
+}
+
+/**
+ * Initialize a VAD context for voice activity detection
+ * @param options VAD context options
+ * @returns Promise resolving to WhisperVadContext instance
+ */
+exports.WhisperVadContext = WhisperVadContext;
+async function initWhisperVad(_ref4) {
+  let {
+    filePath,
+    isBundleAsset,
+    useGpu = true,
+    nThreads
+  } = _ref4;
+  let path = '';
+  if (typeof filePath === 'number') {
+    try {
+      const source = _reactNative.Image.resolveAssetSource(filePath);
+      if (source) {
+        path = source.uri;
+      }
+    } catch (e) {
+      throw new Error(`Invalid asset: ${filePath}`);
+    }
+  } else {
+    if (!isBundleAsset && filePath.startsWith('http')) throw new Error('VAD remote file is not supported, please download it first');
+    path = filePath;
+  }
+  if (path.startsWith('file://')) path = path.slice(7);
+  const {
+    contextId,
+    gpu,
+    reasonNoGPU
+  } = await _NativeRNWhisper.default.initVadContext({
+    filePath: path,
+    isBundleAsset: !!isBundleAsset,
+    useGpu,
+    nThreads
+  });
+  return new WhisperVadContext({
+    contextId,
+    gpu,
+    reasonNoGPU
+  });
+}
+
+/**
+ * Release all VAD contexts and free their memory
+ * @returns Promise resolving when all contexts are released
+ */
+async function releaseAllWhisperVad() {
+  return _NativeRNWhisper.default.releaseAllVadContexts();
+}
 //# sourceMappingURL=index.js.map
```
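Putting the new surface together, the compiled output above corresponds to roughly the following kind of usage from application code; a hedged end-to-end sketch (model and audio paths are placeholders, option values are illustrative, and defaults ultimately come from `whisper_vad_default_params()`):

```ts
import { initWhisperVad, releaseAllWhisperVad } from 'whisper.rn'

async function findSpeechSegments() {
  // Load a VAD model from a local path or bundle asset (remote URLs are rejected by the wrapper).
  const vad = await initWhisperVad({
    filePath: '/path/to/ggml-silero-vad.bin', // placeholder model path
    useGpu: true,
  })
  console.log('Metal enabled:', vad.gpu, '| reason if not:', vad.reasonNoGPU)

  // Detect speech in a WAV file (a `data:audio/wav;base64,` string also works).
  const segments = await vad.detectSpeech('/path/to/recording.wav', {
    threshold: 0.5,           // illustrative values
    minSpeechDurationMs: 250,
  })
  segments.forEach(({ t0, t1 }) => console.log(`speech segment: ${t0} -> ${t1}`))

  await vad.release()            // release this context
  await releaseAllWhisperVad()   // or release every VAD context at once
}
```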