whisper.rn 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/README.md +28 -0
  2. package/android/src/main/java/com/rnwhisper/AudioUtils.java +119 -0
  3. package/android/src/main/java/com/rnwhisper/WhisperContext.java +74 -39
  4. package/android/src/main/jni.cpp +45 -12
  5. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +26 -0
  6. package/cpp/rn-whisper.cpp +51 -0
  7. package/cpp/rn-whisper.h +2 -1
  8. package/ios/RNWhisper.mm +81 -22
  9. package/ios/RNWhisper.xcodeproj/project.pbxproj +27 -3
  10. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  11. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +5 -0
  12. package/ios/RNWhisperAudioSessionUtils.h +13 -0
  13. package/ios/RNWhisperAudioSessionUtils.m +85 -0
  14. package/ios/RNWhisperAudioUtils.h +9 -0
  15. package/ios/RNWhisperAudioUtils.m +83 -0
  16. package/ios/RNWhisperContext.h +1 -0
  17. package/ios/RNWhisperContext.mm +101 -28
  18. package/lib/commonjs/AudioSessionIos.js +91 -0
  19. package/lib/commonjs/AudioSessionIos.js.map +1 -0
  20. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  21. package/lib/commonjs/index.js +82 -14
  22. package/lib/commonjs/index.js.map +1 -1
  23. package/lib/module/AudioSessionIos.js +83 -0
  24. package/lib/module/AudioSessionIos.js.map +1 -0
  25. package/lib/module/NativeRNWhisper.js.map +1 -1
  26. package/lib/module/index.js +77 -14
  27. package/lib/module/index.js.map +1 -1
  28. package/lib/typescript/AudioSessionIos.d.ts +54 -0
  29. package/lib/typescript/AudioSessionIos.d.ts.map +1 -0
  30. package/lib/typescript/NativeRNWhisper.d.ts +8 -0
  31. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  32. package/lib/typescript/index.d.ts +62 -4
  33. package/lib/typescript/index.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/AudioSessionIos.ts +90 -0
  36. package/src/NativeRNWhisper.ts +11 -1
  37. package/src/index.ts +178 -28
package/ios/RNWhisper.mm CHANGED
@@ -1,6 +1,8 @@
1
1
  #import "RNWhisper.h"
2
2
  #import "RNWhisperContext.h"
3
3
  #import "RNWhisperDownloader.h"
4
+ #import "RNWhisperAudioUtils.h"
5
+ #import "RNWhisperAudioSessionUtils.h"
4
6
  #include <stdlib.h>
5
7
  #include <string>
6
8
 
@@ -87,6 +89,7 @@ RCT_REMAP_METHOD(initContext,
87
89
  - (NSArray *)supportedEvents {
88
90
  return@[
89
91
  @"@RNWhisper_onTranscribeProgress",
92
+ @"@RNWhisper_onTranscribeNewSegments",
90
93
  @"@RNWhisper_onRealtimeTranscribe",
91
94
  @"@RNWhisper_onRealtimeTranscribeEnd",
92
95
  ];
@@ -121,7 +124,7 @@ RCT_REMAP_METHOD(transcribeFile,
121
124
  }
122
125
 
123
126
  int count = 0;
124
- float *waveFile = [self decodeWaveFile:path count:&count];
127
+ float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
125
128
  if (waveFile == nil) {
126
129
  reject(@"whisper_error", @"Invalid file", nil);
127
130
  return;
@@ -144,6 +147,20 @@ RCT_REMAP_METHOD(transcribeFile,
144
147
  ];
145
148
  });
146
149
  }
150
+ onNewSegments: ^(NSDictionary *result) {
151
+ if (rn_whisper_transcribe_is_aborted(jobId)) {
152
+ return;
153
+ }
154
+ dispatch_async(dispatch_get_main_queue(), ^{
155
+ [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
156
+ body:@{
157
+ @"contextId": [NSNumber numberWithInt:contextId],
158
+ @"jobId": [NSNumber numberWithInt:jobId],
159
+ @"result": result
160
+ }
161
+ ];
162
+ });
163
+ }
147
164
  onEnd: ^(int code) {
148
165
  if (code != 0) {
149
166
  free(waveFile);
@@ -242,27 +259,6 @@ RCT_REMAP_METHOD(releaseAllContexts,
242
259
  resolve(nil);
243
260
  }
244
261
 
245
- - (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
246
- NSURL *url = [NSURL fileURLWithPath:filePath];
247
- NSData *fileData = [NSData dataWithContentsOfURL:url];
248
- if (fileData == nil) {
249
- return nil;
250
- }
251
- NSMutableData *waveData = [[NSMutableData alloc] init];
252
- [waveData appendData:[fileData subdataWithRange:NSMakeRange(44, [fileData length]-44)]];
253
- const short *shortArray = (const short *)[waveData bytes];
254
- int shortCount = (int) ([waveData length] / sizeof(short));
255
- float *floatArray = (float *) malloc(shortCount * sizeof(float));
256
- for (NSInteger i = 0; i < shortCount; i++) {
257
- float floatValue = ((float)shortArray[i]) / 32767.0;
258
- floatValue = MAX(floatValue, -1.0);
259
- floatValue = MIN(floatValue, 1.0);
260
- floatArray[i] = floatValue;
261
- }
262
- *count = shortCount;
263
- return floatArray;
264
- }
265
-
266
262
  - (void)invalidate {
267
263
  [super invalidate];
268
264
 
@@ -283,6 +279,69 @@ RCT_REMAP_METHOD(releaseAllContexts,
283
279
  [RNWhisperDownloader clearCache];
284
280
  }
285
281
 
282
+ // MARK: - AudioSessionUtils
283
+
284
+ RCT_EXPORT_METHOD(getAudioSessionCurrentCategory:(RCTPromiseResolveBlock)resolve
285
+ withRejecter:(RCTPromiseRejectBlock)reject)
286
+ {
287
+ NSString *category = [RNWhisperAudioSessionUtils getCurrentCategory];
288
+ NSArray *options = [RNWhisperAudioSessionUtils getCurrentOptions];
289
+ resolve(@{
290
+ @"category": category,
291
+ @"options": options
292
+ });
293
+ }
294
+
295
+ RCT_EXPORT_METHOD(getAudioSessionCurrentMode:(RCTPromiseResolveBlock)resolve
296
+ withRejecter:(RCTPromiseRejectBlock)reject)
297
+ {
298
+ NSString *mode = [RNWhisperAudioSessionUtils getCurrentMode];
299
+ resolve(mode);
300
+ }
301
+
302
+ RCT_REMAP_METHOD(setAudioSessionCategory,
303
+ withCategory:(NSString *)category
304
+ withOptions:(NSArray *)options
305
+ withResolver:(RCTPromiseResolveBlock)resolve
306
+ withRejecter:(RCTPromiseRejectBlock)reject)
307
+ {
308
+ NSError *error = nil;
309
+ [RNWhisperAudioSessionUtils setCategory:category options:options error:&error];
310
+ if (error != nil) {
311
+ reject(@"whisper_error", [NSString stringWithFormat:@"Failed to set category. Error: %@", error], nil);
312
+ return;
313
+ }
314
+ resolve(nil);
315
+ }
316
+
317
+ RCT_REMAP_METHOD(setAudioSessionMode,
318
+ withMode:(NSString *)mode
319
+ withResolver:(RCTPromiseResolveBlock)resolve
320
+ withRejecter:(RCTPromiseRejectBlock)reject)
321
+ {
322
+ NSError *error = nil;
323
+ [RNWhisperAudioSessionUtils setMode:mode error:&error];
324
+ if (error != nil) {
325
+ reject(@"whisper_error", [NSString stringWithFormat:@"Failed to set mode. Error: %@", error], nil);
326
+ return;
327
+ }
328
+ resolve(nil);
329
+ }
330
+
331
+ RCT_REMAP_METHOD(setAudioSessionActive,
332
+ withActive:(BOOL)active
333
+ withResolver:(RCTPromiseResolveBlock)resolve
334
+ withRejecter:(RCTPromiseRejectBlock)reject)
335
+ {
336
+ NSError *error = nil;
337
+ [RNWhisperAudioSessionUtils setActive:active error:&error];
338
+ if (error != nil) {
339
+ reject(@"whisper_error", [NSString stringWithFormat:@"Failed to set active. Error: %@", error], nil);
340
+ return;
341
+ }
342
+ resolve(nil);
343
+ }
344
+
286
345
  #ifdef RCT_NEW_ARCH_ENABLED
287
346
  - (std::shared_ptr<facebook::react::TurboModule>)getTurboModule:
288
347
  (const facebook::react::ObjCTurboModule::InitParams &)params
@@ -8,6 +8,10 @@
8
8
 
9
9
  /* Begin PBXBuildFile section */
10
10
  5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */ = {isa = PBXBuildFile; fileRef = B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */; };
11
+ 7F458E922AC7DC74007045F6 /* RNWhisperAudioSessionUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = 7F458E912AC7DC74007045F6 /* RNWhisperAudioSessionUtils.m */; };
12
+ 7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */; };
13
+ 7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */; };
14
+ 7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */; };
11
15
  /* End PBXBuildFile section */
12
16
 
13
17
  /* Begin PBXCopyFilesBuildPhase section */
@@ -24,6 +28,15 @@
24
28
 
25
29
  /* Begin PBXFileReference section */
26
30
  134814201AA4EA6300B7C361 /* libRNWhisper.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNWhisper.a; sourceTree = BUILT_PRODUCTS_DIR; };
31
+ 7F458E902AC7DC74007045F6 /* RNWhisperAudioSessionUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperAudioSessionUtils.h; sourceTree = "<group>"; };
32
+ 7F458E912AC7DC74007045F6 /* RNWhisperAudioSessionUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperAudioSessionUtils.m; sourceTree = "<group>"; };
33
+ 7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisper.h; sourceTree = "<group>"; };
34
+ 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperDownloader.m; sourceTree = "<group>"; };
35
+ 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = RNWhisperAudioUtils.m; sourceTree = "<group>"; };
36
+ 7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperContext.h; sourceTree = "<group>"; };
37
+ 7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperDownloader.h; sourceTree = "<group>"; };
38
+ 7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNWhisperAudioUtils.h; sourceTree = "<group>"; };
39
+ 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisperContext.mm; sourceTree = "<group>"; };
27
40
  B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RNWhisper.mm; sourceTree = "<group>"; };
28
41
  /* End PBXFileReference section */
29
42
 
@@ -49,6 +62,15 @@
49
62
  58B511D21A9E6C8500147676 = {
50
63
  isa = PBXGroup;
51
64
  children = (
65
+ 7F458E902AC7DC74007045F6 /* RNWhisperAudioSessionUtils.h */,
66
+ 7F458E912AC7DC74007045F6 /* RNWhisperAudioSessionUtils.m */,
67
+ 7FE0BB9F2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.h */,
68
+ 7FE0BB9C2ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m */,
69
+ 7FE0BB9A2ABE6C7B0049B4E4 /* RNWhisper.h */,
70
+ 7FE0BB9D2ABE6C7B0049B4E4 /* RNWhisperContext.h */,
71
+ 7FE0BBA02ABE6C7B0049B4E4 /* RNWhisperContext.mm */,
72
+ 7FE0BB9E2ABE6C7B0049B4E4 /* RNWhisperDownloader.h */,
73
+ 7FE0BB9B2ABE6C7B0049B4E4 /* RNWhisperDownloader.m */,
52
74
  B3E7B5891CC2AC0600A0062D /* RNWhisper.mm */,
53
75
  134814211AA4EA7D00B7C361 /* Products */,
54
76
  );
@@ -112,6 +134,10 @@
112
134
  buildActionMask = 2147483647;
113
135
  files = (
114
136
  5E555C0D2413F4C50049A1A2 /* RNWhisper.mm in Sources */,
137
+ 7FE0BBA22ABE6C7B0049B4E4 /* RNWhisperAudioUtils.m in Sources */,
138
+ 7FE0BBA32ABE6C7B0049B4E4 /* RNWhisperContext.mm in Sources */,
139
+ 7FE0BBA12ABE6C7B0049B4E4 /* RNWhisperDownloader.m in Sources */,
140
+ 7F458E922AC7DC74007045F6 /* RNWhisperAudioSessionUtils.m in Sources */,
115
141
  );
116
142
  runOnlyForDeploymentPostprocessing = 0;
117
143
  };
@@ -223,9 +249,7 @@
223
249
  "$(SRCROOT)/../../react-native/React/**",
224
250
  );
225
251
  LIBRARY_SEARCH_PATHS = "$(inherited)";
226
- OTHER_LDFLAGS = (
227
- "-ObjC",
228
- );
252
+ OTHER_LDFLAGS = "-ObjC";
229
253
  PRODUCT_NAME = RNWhisper;
230
254
  SKIP_INSTALL = YES;
231
255
  };
@@ -4,6 +4,11 @@
4
4
  <dict>
5
5
  <key>SchemeUserState</key>
6
6
  <dict>
7
+ <key>RNWhisper.xcscheme_^#shared#^_</key>
8
+ <dict>
9
+ <key>orderHint</key>
10
+ <integer>0</integer>
11
+ </dict>
7
12
  <key>WhisperCpp.xcscheme_^#shared#^_</key>
8
13
  <dict>
9
14
  <key>orderHint</key>
@@ -0,0 +1,13 @@
1
+ #import <Foundation/Foundation.h>
2
+ #import <AVFoundation/AVFoundation.h>
3
+
4
+ @interface RNWhisperAudioSessionUtils : NSObject
5
+
6
+ +(NSString *)getCurrentCategory;
7
+ +(NSArray *)getCurrentOptions;
8
+ +(NSString *)getCurrentMode;
9
+ +(void)setCategory:(NSString *)category options:(NSArray *)options error:(NSError **)error;
10
+ +(void)setMode:(NSString *)mode error:(NSError **)error;
11
+ +(void)setActive:(BOOL)active error:(NSError **)error;
12
+
13
+ @end
@@ -0,0 +1,85 @@
1
+ #import "RNWhisperAudioSessionUtils.h"
2
+
3
+ @implementation RNWhisperAudioSessionUtils
4
+
5
+ static NSDictionary *_categories;
6
+ static NSDictionary *_options;
7
+ static NSDictionary *_modes;
8
+
9
+ + (void)initialize {
10
+ _categories = @{
11
+ @"Ambient": AVAudioSessionCategoryAmbient,
12
+ @"SoloAmbient": AVAudioSessionCategorySoloAmbient,
13
+ @"Playback": AVAudioSessionCategoryPlayback,
14
+ @"Record": AVAudioSessionCategoryRecord,
15
+ @"PlayAndRecord": AVAudioSessionCategoryPlayAndRecord,
16
+ @"MultiRoute": AVAudioSessionCategoryMultiRoute
17
+ };
18
+ _options = @{
19
+ @"MixWithOthers": @(AVAudioSessionCategoryOptionMixWithOthers),
20
+ @"DuckOthers": @(AVAudioSessionCategoryOptionDuckOthers),
21
+ @"InterruptSpokenAudioAndMixWithOthers": @(AVAudioSessionCategoryOptionInterruptSpokenAudioAndMixWithOthers),
22
+ @"AllowBluetooth": @(AVAudioSessionCategoryOptionAllowBluetooth),
23
+ @"AllowBluetoothA2DP": @(AVAudioSessionCategoryOptionAllowBluetoothA2DP),
24
+ @"AllowAirPlay": @(AVAudioSessionCategoryOptionAllowAirPlay),
25
+ @"DefaultToSpeaker": @(AVAudioSessionCategoryOptionDefaultToSpeaker)
26
+ };
27
+ _modes = @{
28
+ @"Default": AVAudioSessionModeDefault,
29
+ @"VoiceChat": AVAudioSessionModeVoiceChat,
30
+ @"VideoChat": AVAudioSessionModeVideoChat,
31
+ @"GameChat": AVAudioSessionModeGameChat,
32
+ @"VideoRecording": AVAudioSessionModeVideoRecording,
33
+ @"Measurement": AVAudioSessionModeMeasurement,
34
+ @"MoviePlayback": AVAudioSessionModeMoviePlayback,
35
+ @"SpokenAudio": AVAudioSessionModeSpokenAudio
36
+ };
37
+ }
38
+
39
+ +(NSString *)getCurrentCategory {
40
+ AVAudioSession *session = [AVAudioSession sharedInstance];
41
+ return session.category;
42
+ }
43
+
44
+ +(NSArray *)getCurrentOptions {
45
+ AVAudioSession *session = [AVAudioSession sharedInstance];
46
+ AVAudioSessionCategoryOptions options = session.categoryOptions;
47
+ NSMutableArray *result = [NSMutableArray array];
48
+ for (NSString *key in _options) {
49
+ if ((options & [[_options objectForKey:key] unsignedIntegerValue]) != 0) {
50
+ [result addObject:key];
51
+ }
52
+ }
53
+ return result;
54
+ }
55
+
56
+ +(NSString *)getCurrentMode {
57
+ AVAudioSession *session = [AVAudioSession sharedInstance];
58
+ return session.mode;
59
+ }
60
+
61
+ +(AVAudioSessionCategoryOptions)getOptions:(NSArray *)options {
62
+ AVAudioSessionCategoryOptions result = 0;
63
+ for (NSString *option in options) {
64
+ result |= [[_options objectForKey:option] unsignedIntegerValue];
65
+ }
66
+ return result;
67
+ }
68
+
69
+ +(void)setCategory:(NSString *)category options:(NSArray *)options error:(NSError **)error {
70
+ AVAudioSession *session = [AVAudioSession sharedInstance];
71
+ [session setCategory:[_categories objectForKey:category] withOptions:[self getOptions:options] error:error];
72
+ }
73
+
74
+ +(void)setMode:(NSString *)mode error:(NSError **)error {
75
+ AVAudioSession *session = [AVAudioSession sharedInstance];
76
+ [session setMode:[_modes objectForKey:mode] error:error];
77
+ }
78
+
79
+ +(void)setActive:(BOOL)active error:(NSError **)error {
80
+ AVAudioSession *session = [AVAudioSession sharedInstance];
81
+ [session setActive:active error:error];
82
+ }
83
+
84
+
85
+ @end
@@ -0,0 +1,9 @@
1
+ #import <Foundation/Foundation.h>
2
+
3
+ @interface RNWhisperAudioUtils : NSObject
4
+
5
+ + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
6
+ + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
7
+ + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
8
+
9
+ @end
@@ -0,0 +1,83 @@
1
+ #import "RNWhisperAudioUtils.h"
2
+ #import "whisper.h"
3
+
4
+ @implementation RNWhisperAudioUtils
5
+
6
+ + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
7
+ NSMutableData *outputData = [NSMutableData data];
8
+ for (int i = 0; i < buffers.count; i++) {
9
+ int size = [sliceNSamples objectAtIndex:i].intValue;
10
+ NSValue *buffer = [buffers objectAtIndex:i];
11
+ short *bufferPtr = buffer.pointerValue;
12
+ [outputData appendBytes:bufferPtr length:size * sizeof(short)];
13
+ }
14
+ return outputData;
15
+ }
16
+
17
+ + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
18
+ NSMutableData *outputData = [NSMutableData data];
19
+
20
+ // WAVE header
21
+ [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
22
+ int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
23
+ [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
24
+ [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
25
+ [outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
26
+
27
+ int subchunk1Size = CFSwapInt32HostToLittle(16);
28
+ [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
29
+
30
+ short audioFormat = CFSwapInt16HostToLittle(1); // PCM
31
+ [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
32
+
33
+ short numChannels = CFSwapInt16HostToLittle(1); // mono
34
+ [outputData appendBytes:&numChannels length:sizeof(numChannels)];
35
+
36
+ int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
37
+ [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
38
+
39
+ // (bitDepth * sampleRate * channels) >> 3
40
+ int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
41
+ [outputData appendBytes:&byteRate length:sizeof(byteRate)];
42
+
43
+ // (bitDepth * channels) >> 3
44
+ short blockAlign = CFSwapInt16HostToLittle(16 / 8);
45
+ [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
46
+
47
+ // bitDepth
48
+ short bitsPerSample = CFSwapInt16HostToLittle(16);
49
+ [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
50
+
51
+ [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
52
+ int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
53
+ [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
54
+
55
+ // Audio data
56
+ [outputData appendData:rawData];
57
+
58
+ // Save to file
59
+ [outputData writeToFile:audioOutputFile atomically:YES];
60
+ }
61
+
62
+ + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
63
+ NSURL *url = [NSURL fileURLWithPath:filePath];
64
+ NSData *fileData = [NSData dataWithContentsOfURL:url];
65
+ if (fileData == nil) {
66
+ return nil;
67
+ }
68
+ NSMutableData *waveData = [[NSMutableData alloc] init];
69
+ [waveData appendData:[fileData subdataWithRange:NSMakeRange(44, [fileData length]-44)]];
70
+ const short *shortArray = (const short *)[waveData bytes];
71
+ int shortCount = (int) ([waveData length] / sizeof(short));
72
+ float *floatArray = (float *) malloc(shortCount * sizeof(float));
73
+ for (NSInteger i = 0; i < shortCount; i++) {
74
+ float floatValue = ((float)shortArray[i]) / 32767.0;
75
+ floatValue = MAX(floatValue, -1.0);
76
+ floatValue = MIN(floatValue, 1.0);
77
+ floatArray[i] = floatValue;
78
+ }
79
+ *count = shortCount;
80
+ return floatArray;
81
+ }
82
+
83
+ @end
@@ -53,6 +53,7 @@ typedef struct {
53
53
  audioDataCount:(int)audioDataCount
54
54
  options:(NSDictionary *)options
55
55
  onProgress:(void (^)(int))onProgress
56
+ onNewSegments:(void (^)(NSDictionary *))onNewSegments
56
57
  onEnd:(void (^)(int))onEnd;
57
58
  - (void)stopTranscribe:(int)jobId;
58
59
  - (void)stopCurrentTranscribe;
@@ -1,4 +1,6 @@
1
1
  #import "RNWhisperContext.h"
2
+ #import "RNWhisperAudioUtils.h"
3
+ #include <vector>
2
4
 
3
5
  #define NUM_BYTES_PER_BUFFER 16 * 1024
4
6
 
@@ -77,6 +79,29 @@
77
79
  }
78
80
  }
79
81
 
82
+ bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
83
+ {
84
+ bool isSpeech = true;
85
+ if (!state->isTranscribing && state->options[@"useVad"]) {
86
+ int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
87
+ int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
88
+ if (nSamples + n > sampleSize) {
89
+ int start = nSamples + n - sampleSize;
90
+ std::vector<float> audioBufferF32Vec(sampleSize);
91
+ for (int i = 0; i < sampleSize; i++) {
92
+ audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
93
+ }
94
+ float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
95
+ float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
96
+ isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
97
+ NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
98
+ } else {
99
+ isSpeech = false;
100
+ }
101
+ }
102
+ return isSpeech;
103
+ }
104
+
80
105
  void AudioInputCallback(void * inUserData,
81
106
  AudioQueueRef inAQ,
82
107
  AudioQueueBufferRef inBuffer,
@@ -117,6 +142,11 @@ void AudioInputCallback(void * inUserData,
117
142
  !state->isTranscribing &&
118
143
  nSamples != state->nSamplesTranscribing
119
144
  ) {
145
+ int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
146
+ if (!vad(state, audioBufferI16, nSamples, 0)) {
147
+ state->transcribeHandler(state->jobId, @"end", @{});
148
+ return;
149
+ }
120
150
  state->isTranscribing = true;
121
151
  dispatch_async([state->mSelf getDispatchQueue], ^{
122
152
  [state->mSelf fullTranscribeSamples:state];
@@ -142,11 +172,15 @@ void AudioInputCallback(void * inUserData,
142
172
  for (int i = 0; i < n; i++) {
143
173
  audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
144
174
  }
175
+
176
+ bool isSpeech = vad(state, audioBufferI16, nSamples, n);
145
177
  nSamples += n;
146
178
  state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
147
179
 
148
180
  AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
149
181
 
182
+ if (!isSpeech) return;
183
+
150
184
  if (!state->isTranscribing) {
151
185
  state->isTranscribing = true;
152
186
  dispatch_async([state->mSelf getDispatchQueue], ^{
@@ -167,7 +201,8 @@ void AudioInputCallback(void * inUserData,
167
201
  audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
168
202
  }
169
203
  CFTimeInterval timeStart = CACurrentMediaTime();
170
- int code = [state->mSelf fullTranscribe:state->jobId audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing options:state->options];
204
+ struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
205
+ int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
171
206
  free(audioBufferF32);
172
207
  CFTimeInterval timeEnd = CACurrentMediaTime();
173
208
  const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
@@ -212,6 +247,17 @@ void AudioInputCallback(void * inUserData,
212
247
  NSLog(@"[RNWhisper] Transcribe end");
213
248
  result[@"isStoppedByAction"] = @(state->isStoppedByAction);
214
249
  result[@"isCapturing"] = @(false);
250
+
251
+ // Save wav if needed
252
+ if (state->options[@"audioOutputPath"] != nil) {
253
+ // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
254
+ [RNWhisperAudioUtils
255
+ saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
256
+ sliceNSamples:state->sliceNSamples]
257
+ audioOutputFile:state->options[@"audioOutputPath"]
258
+ ];
259
+ }
260
+
215
261
  state->transcribeHandler(state->jobId, @"end", result);
216
262
  } else if (code == 0) {
217
263
  result[@"isCapturing"] = @(true);
@@ -272,18 +318,70 @@ void AudioInputCallback(void * inUserData,
272
318
  return status;
273
319
  }
274
320
 
321
+ struct rnwhisper_segments_callback_data {
322
+ void (^onNewSegments)(NSDictionary *);
323
+ int total_n_new;
324
+ };
325
+
275
326
  - (void)transcribeFile:(int)jobId
276
327
  audioData:(float *)audioData
277
328
  audioDataCount:(int)audioDataCount
278
329
  options:(NSDictionary *)options
279
330
  onProgress:(void (^)(int))onProgress
331
+ onNewSegments:(void (^)(NSDictionary *))onNewSegments
280
332
  onEnd:(void (^)(int))onEnd
281
333
  {
282
334
  dispatch_async(dQueue, ^{
283
335
  self->recordState.isStoppedByAction = false;
284
336
  self->recordState.isTranscribing = true;
285
337
  self->recordState.jobId = jobId;
286
- int code = [self fullTranscribeWithProgress:onProgress jobId:jobId audioData:audioData audioDataCount:audioDataCount options:options];
338
+
339
+ whisper_full_params params = [self getParams:options jobId:jobId];
340
+ if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
341
+ params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
342
+ void (^onProgress)(int) = (__bridge void (^)(int))user_data;
343
+ onProgress(progress);
344
+ };
345
+ params.progress_callback_user_data = (__bridge void *)(onProgress);
346
+ }
347
+
348
+ if (options[@"onNewSegments"] && [options[@"onNewSegments"] boolValue]) {
349
+ params.new_segment_callback = [](struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
350
+ struct rnwhisper_segments_callback_data *data = (struct rnwhisper_segments_callback_data *)user_data;
351
+ data->total_n_new += n_new;
352
+
353
+ NSString *text = @"";
354
+ NSMutableArray *segments = [[NSMutableArray alloc] init];
355
+ for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
356
+ const char * text_cur = whisper_full_get_segment_text(ctx, i);
357
+ text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
358
+
359
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
360
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
361
+ NSDictionary *segment = @{
362
+ @"text": [NSString stringWithUTF8String:text_cur],
363
+ @"t0": [NSNumber numberWithLongLong:t0],
364
+ @"t1": [NSNumber numberWithLongLong:t1]
365
+ };
366
+ [segments addObject:segment];
367
+ }
368
+
369
+ NSDictionary *result = @{
370
+ @"nNew": [NSNumber numberWithInt:n_new],
371
+ @"totalNNew": [NSNumber numberWithInt:data->total_n_new],
372
+ @"result": text,
373
+ @"segments": segments
374
+ };
375
+ void (^onNewSegments)(NSDictionary *) = (void (^)(NSDictionary *))data->onNewSegments;
376
+ onNewSegments(result);
377
+ };
378
+ struct rnwhisper_segments_callback_data user_data = {
379
+ .onNewSegments = onNewSegments,
380
+ .total_n_new = 0
381
+ };
382
+ params.new_segment_callback_user_data = &user_data;
383
+ }
384
+ int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
287
385
  self->recordState.jobId = -1;
288
386
  self->recordState.isTranscribing = false;
289
387
  onEnd(code);
@@ -383,36 +481,11 @@ void AudioInputCallback(void * inUserData,
383
481
  return params;
384
482
  }
385
483
 
386
- - (int)fullTranscribeWithProgress:(void (^)(int))onProgress
387
- jobId:(int)jobId
388
- audioData:(float *)audioData
389
- audioDataCount:(int)audioDataCount
390
- options:(NSDictionary *)options
391
- {
392
- struct whisper_full_params params = [self getParams:options jobId:jobId];
393
- if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
394
- params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
395
- void (^onProgress)(int) = (__bridge void (^)(int))user_data;
396
- onProgress(progress);
397
- };
398
- params.progress_callback_user_data = (__bridge void *)(onProgress);
399
- }
400
- whisper_reset_timings(self->ctx);
401
-
402
- int code = whisper_full(self->ctx, params, audioData, audioDataCount);
403
- rn_whisper_remove_abort_map(jobId);
404
- // if (code == 0) {
405
- // whisper_print_timings(self->ctx);
406
- // }
407
- return code;
408
- }
409
-
410
484
  - (int)fullTranscribe:(int)jobId
485
+ params:(struct whisper_full_params)params
411
486
  audioData:(float *)audioData
412
487
  audioDataCount:(int)audioDataCount
413
- options:(NSDictionary *)options
414
488
  {
415
- struct whisper_full_params params = [self getParams:options jobId:jobId];
416
489
  whisper_reset_timings(self->ctx);
417
490
 
418
491
  int code = whisper_full(self->ctx, params, audioData, audioDataCount);