whisper.rn 0.4.1 → 0.4.2

This diff shows the changes between two publicly released versions of the package, as their contents appear in the public registry, and is provided for informational purposes only.
@@ -415,7 +415,8 @@ public class RNWhisper implements LifecycleEventListener {
  @Override
  protected WritableArray doInBackground(Void... voids) {
    try {
-     return vadContext.detectSpeech(audioDataBase64, options);
+     float[] audioData = AudioUtils.decodePcmData(audioDataBase64);
+     return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
    } catch (Exception e) {
      exception = e;
      return null;
@@ -468,7 +469,7 @@ public class RNWhisper implements LifecycleEventListener {
        throw new Exception("Failed to load audio file: " + filePathOrBase64);
      }

-     return vadContext.detectSpeechWithAudioData(audioData, options);
+     return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
    } catch (Exception e) {
      exception = e;
      return null;
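
On the Android side, the base64 PCM decoding that previously lived inside WhisperVadContext.detectSpeech (removed in the hunk below) now happens in the caller via AudioUtils.decodePcmData before the samples reach detectSpeechWithAudioData. The body of decodePcmData is not part of this diff; a minimal sketch, assuming it simply lifts the inline little-endian float32 decoder that was removed from WhisperVadContext, could look like this (the class name AudioUtilsSketch is illustrative only, not the package's actual code):

// Hypothetical sketch of what AudioUtils.decodePcmData could do, based on the
// inline decoder removed from WhisperVadContext below.
import android.util.Base64;

public final class AudioUtilsSketch {
  // Decodes a base64 string of raw little-endian float32 PCM into a float array.
  public static float[] decodePcmData(String audioDataBase64) {
    byte[] audioBytes = Base64.decode(audioDataBase64, Base64.DEFAULT);
    int numSamples = audioBytes.length / 4; // 4 bytes per float32 sample
    float[] audioData = new float[numSamples];
    for (int i = 0; i < numSamples; i++) {
      int intBits = (audioBytes[i * 4] & 0xFF) |
          ((audioBytes[i * 4 + 1] & 0xFF) << 8) |
          ((audioBytes[i * 4 + 2] & 0xFF) << 16) |
          ((audioBytes[i * 4 + 3] & 0xFF) << 24);
      audioData[i] = Float.intBitsToFloat(intBits);
    }
    return audioData;
  }
}
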
@@ -25,70 +25,14 @@ public class WhisperVadContext {
    this.reactContext = reactContext;
  }

- public WritableArray detectSpeech(String audioDataBase64, ReadableMap options) throws Exception {
+ public WritableArray detectSpeechWithAudioData(float[] audioData, int numSamples, ReadableMap options) throws Exception {
    if (vadContext == 0) {
      throw new Exception("VAD context is null");
    }

-   // Decode base64 audio data to float array
-   byte[] audioBytes = Base64.decode(audioDataBase64, Base64.DEFAULT);
-   int numSamples = audioBytes.length / 4; // 4 bytes per float
-   float[] audioData = new float[numSamples];
-
-   for (int i = 0; i < numSamples; i++) {
-     int intBits = (audioBytes[i * 4] & 0xFF) |
-       ((audioBytes[i * 4 + 1] & 0xFF) << 8) |
-       ((audioBytes[i * 4 + 2] & 0xFF) << 16) |
-       ((audioBytes[i * 4 + 3] & 0xFF) << 24);
-     audioData[i] = Float.intBitsToFloat(intBits);
-   }
-
    return processVadDetection(audioData, numSamples, options);
  }

- public WritableArray detectSpeechFile(String filePathOrBase64, ReadableMap options) throws Exception {
-   if (vadContext == 0) {
-     throw new Exception("VAD context is null");
-   }
-
-   // Follow the same pattern as transcribeFile
-   String filePath = filePathOrBase64;
-
-   // Handle HTTP downloads
-   if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
-     // Note: This would require access to the downloader, but for now we'll throw an error
-     throw new Exception("HTTP URLs not supported in VAD file detection. Please download the file first.");
-   }
-
-   float[] audioData;
-
-   // Check for resource identifier (bundled assets)
-   int resId = getResourceIdentifier(filePath);
-   if (resId > 0) {
-     audioData = AudioUtils.decodeWaveFile(reactContext.getResources().openRawResource(resId));
-   } else if (filePathOrBase64.startsWith("data:audio/wav;base64,")) {
-     // Handle base64 WAV data
-     audioData = AudioUtils.decodeWaveData(filePathOrBase64);
-   } else {
-     // Handle regular file path
-     audioData = AudioUtils.decodeWaveFile(new java.io.FileInputStream(new java.io.File(filePath)));
-   }
-
-   if (audioData == null) {
-     throw new Exception("Failed to load audio file: " + filePathOrBase64);
-   }
-
-   return processVadDetection(audioData, audioData.length, options);
- }
-
- public WritableArray detectSpeechWithAudioData(float[] audioData, ReadableMap options) throws Exception {
-   if (vadContext == 0) {
-     throw new Exception("VAD context is null");
-   }
-
-   return processVadDetection(audioData, audioData.length, options);
- }
-
  private int getResourceIdentifier(String filePath) {
    int identifier = reactContext.getResources().getIdentifier(
      filePath,
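
With detectSpeech and detectSpeechFile removed, WhisperVadContext keeps a single entry point that takes already-decoded samples plus an explicit sample count, and callers such as RNWhisper are responsible for loading and decoding audio first. A rough usage sketch under that assumption follows; VadCallerSketch is an illustrative name, AudioUtils and WhisperVadContext are assumed to be on the same classpath, and decodeWaveFile plus detectSpeechWithAudioData are the calls visible in this diff:

// Illustrative caller of the consolidated VAD API; not taken from the package source.
import java.io.File;
import java.io.FileInputStream;

import com.facebook.react.bridge.ReadableMap;
import com.facebook.react.bridge.WritableArray;

class VadCallerSketch {
  // Decode a WAV file to float32 samples, then run VAD on the raw buffer.
  static WritableArray detectFromWavFile(WhisperVadContext vadContext,
                                         String filePath,
                                         ReadableMap options) throws Exception {
    float[] audioData = AudioUtils.decodeWaveFile(new FileInputStream(new File(filePath)));
    if (audioData == null) {
      throw new Exception("Failed to load audio file: " + filePath);
    }
    // The new signature takes the samples and an explicit sample count.
    return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
  }
}
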
package/ios/RNWhisper.mm CHANGED
@@ -507,13 +507,16 @@ RCT_REMAP_METHOD(vadDetectSpeech,
    }

    // Decode base64 audio data
-   NSData *audioData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
-   if (audioData == nil) {
+   NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:audioDataBase64 options:0];
+   if (pcmData == nil) {
      reject(@"whisper_vad_error", @"Invalid audio data", nil);
      return;
    }

-   NSArray *segments = [vadContext detectSpeech:audioData options:options];
+   int count = 0;
+   float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+
+   NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
    resolve(segments);
  }

@@ -549,10 +552,7 @@ RCT_REMAP_METHOD(vadDetectSpeechFile,
      return;
    }

-   // Convert float32 data to NSData for VAD context
-   NSData *audioData = [NSData dataWithBytes:data length:count * sizeof(float)];
-
-   NSArray *segments = [vadContext detectSpeech:audioData options:options];
+   NSArray *segments = [vadContext detectSpeech:data samplesCount:count options:options];
    resolve(segments);
  }

@@ -23,7 +23,7 @@
  - (NSString *)reasonNoMetal;
  - (struct whisper_vad_context *)getVadContext;
  - (dispatch_queue_t)getDispatchQueue;
- - (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options;
+ - (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options;
  - (void)invalidate;

  @end
@@ -73,18 +73,14 @@
    return dQueue;
  }

- - (NSArray *)detectSpeech:(NSData *)audioData options:(NSDictionary *)options {
+ - (NSArray *)detectSpeech:(float *)samples samplesCount:(int)samplesCount options:(NSDictionary *)options {
    if (vctx == NULL) {
      NSLog(@"VAD context is null");
      return @[];
    }

-   // Convert NSData to float array
-   const float *samples = (const float *)[audioData bytes];
-   int n_samples = (int)[audioData length] / sizeof(float);
-
    // Run VAD detection
-   bool speechDetected = whisper_vad_detect_speech(vctx, samples, n_samples);
+   bool speechDetected = whisper_vad_detect_speech(vctx, samples, samplesCount);
    if (!speechDetected) {
      return @[];
    }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "whisper.rn",
-   "version": "0.4.1",
+   "version": "0.4.2",
    "description": "React Native binding of whisper.cpp",
    "main": "lib/commonjs/index",
    "module": "lib/module/index",