whisper.rn 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -415,7 +415,8 @@ public class RNWhisper implements LifecycleEventListener {
       @Override
       protected WritableArray doInBackground(Void... voids) {
         try {
-          return vadContext.detectSpeechWithAudioData(audioDataBase64, options);
+          float[] audioData = AudioUtils.decodePcmData(audioDataBase64);
+          return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
        } catch (Exception e) {
          exception = e;
          return null;
@@ -468,7 +469,7 @@ public class RNWhisper implements LifecycleEventListener {
            throw new Exception("Failed to load audio file: " + filePathOrBase64);
          }
 
-          return vadContext.detectSpeechWithAudioData(audioData, options);
+          return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
        } catch (Exception e) {
          exception = e;
          return null;
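Note: the new AudioUtils.decodePcmData helper called in the first hunk is not itself part of this diff. Judging from the inline decoder it replaces in WhisperVadContext (removed in the next hunk), it presumably converts a base64 string of raw little-endian float32 PCM into a float[]. A minimal sketch of that assumption follows; this is a reconstruction, not the shipped implementation:

// Hypothetical reconstruction of AudioUtils.decodePcmData, based on the
// inline decoder this release removes from WhisperVadContext below.
import android.util.Base64;

class AudioUtilsSketch {
  // Decode base64-encoded raw little-endian float32 PCM into a sample array.
  static float[] decodePcmData(String audioDataBase64) {
    byte[] audioBytes = Base64.decode(audioDataBase64, Base64.DEFAULT);
    int numSamples = audioBytes.length / 4; // 4 bytes per float32 sample
    float[] audioData = new float[numSamples];
    for (int i = 0; i < numSamples; i++) {
      // Reassemble each little-endian 32-bit pattern, then reinterpret it as a float
      int intBits = (audioBytes[i * 4] & 0xFF) |
          ((audioBytes[i * 4 + 1] & 0xFF) << 8) |
          ((audioBytes[i * 4 + 2] & 0xFF) << 16) |
          ((audioBytes[i * 4 + 3] & 0xFF) << 24);
      audioData[i] = Float.intBitsToFloat(intBits);
    }
    return audioData;
  }
}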
@@ -25,70 +25,14 @@ public class WhisperVadContext {
     this.reactContext = reactContext;
   }
 
-  public WritableArray detectSpeechWithAudioData(String audioDataBase64, ReadableMap options) throws Exception {
+  public WritableArray detectSpeechWithAudioData(float[] audioData, int numSamples, ReadableMap options) throws Exception {
     if (vadContext == 0) {
       throw new Exception("VAD context is null");
     }
 
-    // Decode base64 audio data to float array
-    byte[] audioBytes = Base64.decode(audioDataBase64, Base64.DEFAULT);
-    int numSamples = audioBytes.length / 4; // 4 bytes per float
-    float[] audioData = new float[numSamples];
-
-    for (int i = 0; i < numSamples; i++) {
-      int intBits = (audioBytes[i * 4] & 0xFF) |
-          ((audioBytes[i * 4 + 1] & 0xFF) << 8) |
-          ((audioBytes[i * 4 + 2] & 0xFF) << 16) |
-          ((audioBytes[i * 4 + 3] & 0xFF) << 24);
-      audioData[i] = Float.intBitsToFloat(intBits);
-    }
-
     return processVadDetection(audioData, numSamples, options);
   }
 
-  public WritableArray detectSpeechFile(String filePathOrBase64, ReadableMap options) throws Exception {
-    if (vadContext == 0) {
-      throw new Exception("VAD context is null");
-    }
-
-    // Follow the same pattern as transcribeFile
-    String filePath = filePathOrBase64;
-
-    // Handle HTTP downloads
-    if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
-      // Note: This would require access to the downloader, but for now we'll throw an error
-      throw new Exception("HTTP URLs not supported in VAD file detection. Please download the file first.");
-    }
-
-    float[] audioData;
-
-    // Check for resource identifier (bundled assets)
-    int resId = getResourceIdentifier(filePath);
-    if (resId > 0) {
-      audioData = AudioUtils.decodeWaveFile(reactContext.getResources().openRawResource(resId));
-    } else if (filePathOrBase64.startsWith("data:audio/wav;base64,")) {
-      // Handle base64 WAV data
-      audioData = AudioUtils.decodeWaveData(filePathOrBase64);
-    } else {
-      // Handle regular file path
-      audioData = AudioUtils.decodeWaveFile(new java.io.FileInputStream(new java.io.File(filePath)));
-    }
-
-    if (audioData == null) {
-      throw new Exception("Failed to load audio file: " + filePathOrBase64);
-    }
-
-    return processVadDetection(audioData, audioData.length, options);
-  }
-
-  public WritableArray detectSpeechWithAudioData(float[] audioData, ReadableMap options) throws Exception {
-    if (vadContext == 0) {
-      throw new Exception("VAD context is null");
-    }
-
-    return processVadDetection(audioData, audioData.length, options);
-  }
-
   private int getResourceIdentifier(String filePath) {
     int identifier = reactContext.getResources().getIdentifier(
         filePath,
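Taken together, this hunk collapses WhisperVadContext's three entry points (base64 string, file path, and float[] without an explicit length) into the single detectSpeechWithAudioData(float[], int, ReadableMap) kept above, moving audio loading and decoding to the caller. A caller-side sketch of the new contract, using only names from this diff (vadContext, options, and filePath are assumed to be in scope):

// Load a WAV file and run VAD on it, as RNWhisper.java now does (sketch).
float[] audioData = AudioUtils.decodeWaveFile(
    new java.io.FileInputStream(new java.io.File(filePath)));
WritableArray segments =
    vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);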
package/ios/RNWhisper.mm CHANGED

@@ -507,13 +507,16 @@ RCT_REMAP_METHOD(vadDetectSpeech,
    }
 
    // Decode base64 audio data
-    NSData *audioData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
-    if (audioData == nil) {
+    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+    if (pcmData == nil) {
      reject(@"whisper_vad_error", @"Invalid audio data", nil);
      return;
    }
 
-    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    int count = 0;
+    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
    resolve(segments);
 }
 
@@ -549,10 +552,7 @@ RCT_REMAP_METHOD(vadDetectSpeechFile,
      return;
    }
 
-
-    NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];
-
-    NSArray *segments = [vadContext detectSpeech:audioData options:options];
+    NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
    resolve(segments);
 }
 
@@ -23,7 +23,7 @@
 - (NSString *)reasonNoMetal;
 - (struct whisper_vad_context *)getVadContext;
 - (dispatch_queue_t)getDispatchQueue;
-- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options;
 - (void)invalidate;
 
 @end
@@ -73,18 +73,14 @@
    return dQueue;
 }
 
-- (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options {
+- (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options {
    if (vctx == NULL) {
        NSLog(@"VAD context is null");
        return @[];
    }
 
-    // Convert NSData to float array
-    const float *samples = (const float *)[audioData bytes];
-    int n_samples = (int)[audioData length] / sizeof(float);
-
    // Run VAD detection
-    bool speechDetected = whisper_vad_detect_speech(vctx, samples, n_samples);
+    bool speechDetected = whisper_vad_detect_speech(vctx, samples, samplesCount);
    if (!speechDetected) {
        return @[];
    }