whisper.rn 0.4.0-rc.0 → 0.4.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +14 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +226 -109
  9. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  10. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  11. package/cpp/README.md +1 -1
  12. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  13. package/cpp/coreml/whisper-encoder.h +4 -0
  14. package/cpp/coreml/whisper-encoder.mm +5 -3
  15. package/cpp/ggml-aarch64.c +129 -0
  16. package/cpp/ggml-aarch64.h +19 -0
  17. package/cpp/ggml-alloc.c +805 -400
  18. package/cpp/ggml-alloc.h +60 -10
  19. package/cpp/ggml-backend-impl.h +216 -0
  20. package/cpp/ggml-backend-reg.cpp +204 -0
  21. package/cpp/ggml-backend.cpp +1996 -0
  22. package/cpp/ggml-backend.cpp.rej +12 -0
  23. package/cpp/ggml-backend.h +336 -0
  24. package/cpp/ggml-common.h +1853 -0
  25. package/cpp/ggml-cpp.h +38 -0
  26. package/cpp/ggml-cpu-aarch64.c +3560 -0
  27. package/cpp/ggml-cpu-aarch64.h +30 -0
  28. package/cpp/ggml-cpu-impl.h +371 -0
  29. package/cpp/ggml-cpu-quants.c +10822 -0
  30. package/cpp/ggml-cpu-quants.h +63 -0
  31. package/cpp/ggml-cpu.c +13970 -0
  32. package/cpp/ggml-cpu.cpp +663 -0
  33. package/cpp/ggml-cpu.h +177 -0
  34. package/cpp/ggml-impl.h +551 -0
  35. package/cpp/ggml-metal-impl.h +249 -0
  36. package/cpp/ggml-metal.h +24 -43
  37. package/cpp/ggml-metal.m +4190 -1075
  38. package/cpp/ggml-quants.c +5247 -0
  39. package/cpp/ggml-quants.h +100 -0
  40. package/cpp/ggml-threading.cpp +12 -0
  41. package/cpp/ggml-threading.h +12 -0
  42. package/cpp/ggml-whisper.metallib +0 -0
  43. package/cpp/ggml.c +5474 -18763
  44. package/cpp/ggml.h +833 -628
  45. package/cpp/rn-audioutils.cpp +68 -0
  46. package/cpp/rn-audioutils.h +14 -0
  47. package/cpp/rn-whisper-log.h +11 -0
  48. package/cpp/rn-whisper.cpp +221 -52
  49. package/cpp/rn-whisper.h +50 -15
  50. package/cpp/whisper.cpp +2863 -1340
  51. package/cpp/whisper.h +170 -38
  52. package/ios/RNWhisper.mm +141 -46
  53. package/ios/RNWhisperAudioUtils.h +1 -2
  54. package/ios/RNWhisperAudioUtils.m +18 -67
  55. package/ios/RNWhisperContext.h +11 -8
  56. package/ios/RNWhisperContext.mm +197 -144
  57. package/jest/mock.js +15 -2
  58. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  59. package/lib/commonjs/index.js +78 -28
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/commonjs/version.json +1 -1
  62. package/lib/module/NativeRNWhisper.js.map +1 -1
  63. package/lib/module/index.js +78 -28
  64. package/lib/module/index.js.map +1 -1
  65. package/lib/module/version.json +1 -1
  66. package/lib/typescript/NativeRNWhisper.d.ts +14 -4
  67. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  68. package/lib/typescript/index.d.ts +39 -5
  69. package/lib/typescript/index.d.ts.map +1 -1
  70. package/package.json +9 -7
  71. package/src/NativeRNWhisper.ts +21 -4
  72. package/src/index.ts +102 -42
  73. package/src/version.json +1 -1
  74. package/whisper-rn.podspec +11 -18
  75. package/cpp/ggml-metal.metal +0 -2353
  76. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
  77. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
  78. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  79. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
@@ -1,22 +1,97 @@
1
1
  #import "RNWhisperContext.h"
2
- #import "RNWhisperAudioUtils.h"
2
+ #import <Metal/Metal.h>
3
3
  #include <vector>
4
4
 
5
5
  #define NUM_BYTES_PER_BUFFER 16 * 1024
6
6
 
7
7
  @implementation RNWhisperContext
8
8
 
9
- + (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId {
9
+ + (instancetype)initWithModelPath:(NSString *)modelPath
10
+ contextId:(int)contextId
11
+ noCoreML:(BOOL)noCoreML
12
+ noMetal:(BOOL)noMetal
13
+ useFlashAttn:(BOOL)useFlashAttn
14
+ {
10
15
  RNWhisperContext *context = [[RNWhisperContext alloc] init];
11
16
  context->contextId = contextId;
12
- context->ctx = whisper_init_from_file([modelPath UTF8String]);
17
+ struct whisper_context_params cparams;
18
+ NSString *reasonNoMetal = @"";
19
+ cparams.use_gpu = !noMetal;
20
+ cparams.flash_attn = useFlashAttn;
21
+
22
+ // TODO: Expose dtw_token_timestamps and dtw_aheads_preset
23
+ cparams.dtw_token_timestamps = false;
24
+ // cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
25
+
26
+ cparams.use_coreml = !noCoreML;
27
+ #ifndef WHISPER_USE_COREML
28
+ if (cparams.use_coreml) {
29
+ NSLog(@"[RNWhisper] CoreML is not enabled in this build, ignoring use_coreml option");
30
+ cparams.use_coreml = false;
31
+ }
32
+ #endif
33
+
34
+ #ifndef WSP_GGML_USE_METAL
35
+ if (cparams.use_gpu) {
36
+ NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
37
+ cparams.use_gpu = false;
38
+ }
39
+ #endif
40
+
41
+ #ifdef WSP_GGML_USE_METAL
42
+ if (cparams.use_gpu) {
43
+ #if TARGET_OS_SIMULATOR
44
+ NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
45
+ cparams.use_gpu = false;
46
+ #else // TARGET_OS_SIMULATOR
47
+ // Check ggml-metal availability
48
+ NSError * error = nil;
49
+ id<MTLDevice> device = MTLCreateSystemDefaultDevice();
50
+ id<MTLLibrary> library = [device
51
+ newLibraryWithSource:@"#include <metal_stdlib>\n"
52
+ "using namespace metal;"
53
+ "kernel void test() { simd_sum(0); }"
54
+ options:nil
55
+ error:&error
56
+ ];
57
+ if (error) {
58
+ reasonNoMetal = [error localizedDescription];
59
+ } else {
60
+ id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
61
+ id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
62
+ if (pipeline == nil) {
63
+ reasonNoMetal = [error localizedDescription];
64
+ NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
65
+ cparams.use_gpu = false;
66
+ }
67
+ }
68
+ #endif // TARGET_OS_SIMULATOR
69
+ }
70
+ #endif // WSP_GGML_USE_METAL
71
+
72
+ if (cparams.use_gpu && cparams.use_coreml) {
73
+ NSLog(@"[RNWhisper] Both use_gpu and use_coreml are enabled, ignoring use_coreml option");
74
+ cparams.use_coreml = false; // Skip CoreML if Metal is enabled
75
+ }
76
+
77
+ context->ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
13
78
  context->dQueue = dispatch_queue_create(
14
79
  [[NSString stringWithFormat:@"RNWhisperContext-%d", contextId] UTF8String],
15
80
  DISPATCH_QUEUE_SERIAL
16
81
  );
82
+ context->isMetalEnabled = cparams.use_gpu;
83
+ context->reasonNoMetal = reasonNoMetal;
17
84
  return context;
18
85
  }
19
86
 
87
+ - (bool)isMetalEnabled {
88
+ return isMetalEnabled;
89
+ }
90
+
91
+ - (NSString *)reasonNoMetal {
92
+ return reasonNoMetal;
93
+ }
94
+
20
95
  - (struct whisper_context *)getContext {
21
96
  return self->ctx;
22
97
  }
@@ -25,7 +100,7 @@
25
100
  return self->dQueue;
26
101
  }
27
102
 
28
- - (void)prepareRealtime:(NSDictionary *)options {
103
+ - (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
29
104
  self->recordState.options = options;
30
105
 
31
106
  self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -38,68 +113,40 @@
38
113
  self->recordState.dataFormat.mReserved = 0;
39
114
  self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
40
115
 
41
- int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
42
- int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
43
- self->recordState.maxAudioSec = maxAudioSec;
44
-
45
- int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
46
- int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
47
-
48
- self->recordState.audioSliceSec = audioSliceSec;
49
- self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
116
+ self->recordState.isRealtime = true;
117
+ self->recordState.isTranscribing = false;
118
+ self->recordState.isCapturing = false;
119
+ self->recordState.isStoppedByAction = false;
50
120
 
51
121
  self->recordState.sliceIndex = 0;
52
122
  self->recordState.transcribeSliceIndex = 0;
53
123
  self->recordState.nSamplesTranscribing = 0;
54
124
 
55
- [self freeBufferIfNeeded];
56
- self->recordState.shortBufferSlices = [NSMutableArray new];
57
-
58
- int16_t *audioBufferI16 = (int16_t *) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
59
- [self->recordState.shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
60
-
61
- self->recordState.sliceNSamples = [NSMutableArray new];
62
- [self->recordState.sliceNSamples addObject:[NSNumber numberWithInt:0]];
63
-
64
- self->recordState.isRealtime = true;
65
- self->recordState.isTranscribing = false;
66
- self->recordState.isCapturing = false;
67
- self->recordState.isStoppedByAction = false;
125
+ self->recordState.sliceNSamples.clear();
126
+ self->recordState.sliceNSamples.push_back(0);
127
+
128
+ self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
129
+ self->recordState.job->set_realtime_params(
130
+ {
131
+ .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
132
+ .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
133
+ .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
134
+ .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
135
+ },
136
+ options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
137
+ options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
138
+ options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
139
+ options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
140
+ );
141
+ self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
68
142
 
69
143
  self->recordState.mSelf = self;
70
144
  }
71
145
 
72
- - (void)freeBufferIfNeeded {
73
- if (self->recordState.shortBufferSlices != nil) {
74
- for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
75
- int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
76
- free(audioBufferI16);
77
- }
78
- self->recordState.shortBufferSlices = nil;
79
- }
80
- }
81
-
82
- bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
146
+ bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
83
147
  {
84
- bool isSpeech = true;
85
- if (!state->isTranscribing && state->options[@"useVad"]) {
86
- int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
87
- int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
88
- if (nSamples + n > sampleSize) {
89
- int start = nSamples + n - sampleSize;
90
- std::vector<float> audioBufferF32Vec(sampleSize);
91
- for (int i = 0; i < sampleSize; i++) {
92
- audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
93
- }
94
- float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
95
- float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
96
- isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
97
- NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
98
- } else {
99
- isSpeech = false;
100
- }
101
- }
102
- return isSpeech;
148
+ if (state->isTranscribing) return true;
149
+ return state->job->vad_simple(sliceIndex, nSamples, n);
103
150
  }
104
151
 
105
152
  void AudioInputCallback(void * inUserData,
@@ -114,21 +161,21 @@ void AudioInputCallback(void * inUserData,
114
161
  if (!state->isCapturing) {
115
162
  NSLog(@"[RNWhisper] Not capturing, ignoring audio");
116
163
  if (!state->isTranscribing) {
117
- state->transcribeHandler(state->jobId, @"end", @{});
164
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
118
165
  }
119
166
  return;
120
167
  }
121
168
 
122
169
  int totalNSamples = 0;
123
- for (int i = 0; i < [state->sliceNSamples count]; i++) {
124
- totalNSamples += [[state->sliceNSamples objectAtIndex:i] intValue];
170
+ for (int i = 0; i < state->sliceNSamples.size(); i++) {
171
+ totalNSamples += state->sliceNSamples[i];
125
172
  }
126
173
 
127
174
  const int n = inBuffer->mAudioDataByteSize / 2;
128
175
 
129
- int nSamples = [state->sliceNSamples[state->sliceIndex] intValue];
176
+ int nSamples = state->sliceNSamples[state->sliceIndex];
130
177
 
131
- if (totalNSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
178
+ if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
132
179
  NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
133
180
  state->isCapturing = false;
134
181
  [state->mSelf stopAudio];
@@ -137,14 +184,14 @@ void AudioInputCallback(void * inUserData,
137
184
  nSamples == state->nSamplesTranscribing &&
138
185
  state->sliceIndex == state->transcribeSliceIndex
139
186
  ) {
140
- state->transcribeHandler(state->jobId, @"end", @{});
187
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
141
188
  } else if (
142
189
  !state->isTranscribing &&
143
190
  nSamples != state->nSamplesTranscribing
144
191
  ) {
145
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
146
- if (!vad(state, audioBufferI16, nSamples, 0)) {
147
- state->transcribeHandler(state->jobId, @"end", @{});
192
+ bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
193
+ if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
194
+ [state->mSelf finishRealtimeTranscribe:state result:@{}];
148
195
  return;
149
196
  }
150
197
  state->isTranscribing = true;
@@ -155,31 +202,25 @@ void AudioInputCallback(void * inUserData,
155
202
  return;
156
203
  }
157
204
 
158
- int audioSliceSec = state->audioSliceSec;
159
- if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
205
+ if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
160
206
  // next slice
161
207
  state->sliceIndex++;
162
208
  nSamples = 0;
163
- int16_t* audioBufferI16 = (int16_t*) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
164
- [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
165
- [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
209
+ state->sliceNSamples.push_back(0);
166
210
  }
167
211
 
168
- // Append to buffer
169
- NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
212
+ NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);
170
213
 
171
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
172
- for (int i = 0; i < n; i++) {
173
- audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
174
- }
214
+ state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
175
215
 
176
- bool isSpeech = vad(state, audioBufferI16, nSamples, n);
216
+ bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
177
217
  nSamples += n;
178
- state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
218
+ state->sliceNSamples[state->sliceIndex] = nSamples;
179
219
 
180
220
  AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
181
221
 
182
- if (!isSpeech) return;
222
+ bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
223
+ if (!isSamplesEnough || !isSpeech) return;
183
224
 
184
225
  if (!state->isTranscribing) {
185
226
  state->isTranscribing = true;
@@ -189,21 +230,29 @@ void AudioInputCallback(void * inUserData,
189
230
  }
190
231
  }
191
232
 
233
+ - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
234
+ // Save wav if needed
235
+ if (state->job->audio_output_path != nullptr) {
236
+ // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
237
+ rnaudioutils::save_wav_file(
238
+ rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
239
+ state->job->audio_output_path
240
+ );
241
+ }
242
+ state->transcribeHandler(state->job->job_id, @"end", result);
243
+ rnwhisper::job_remove(state->job->job_id);
244
+ }
245
+
192
246
  - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
193
- int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
247
+ int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
194
248
  state->nSamplesTranscribing = nSamplesOfIndex;
195
249
  NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
196
250
 
197
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->transcribeSliceIndex] pointerValue];
198
- float* audioBufferF32 = (float*) malloc(state->nSamplesTranscribing * sizeof(float));
199
- // convert I16 to F32
200
- for (int i = 0; i < state->nSamplesTranscribing; i++) {
201
- audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
202
- }
251
+ float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
252
+
203
253
  CFTimeInterval timeStart = CACurrentMediaTime();
204
- struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
205
- int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
206
- free(audioBufferF32);
254
+ int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
255
+ free(pcmf32);
207
256
  CFTimeInterval timeEnd = CACurrentMediaTime();
208
257
  const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
209
258
 
@@ -223,7 +272,7 @@ void AudioInputCallback(void * inUserData,
223
272
  result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
224
273
  }
225
274
 
226
- nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
275
+ nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
227
276
 
228
277
  bool isStopped = state->isStoppedByAction || (
229
278
  !state->isCapturing &&
@@ -248,23 +297,13 @@ void AudioInputCallback(void * inUserData,
248
297
  result[@"isStoppedByAction"] = @(state->isStoppedByAction);
249
298
  result[@"isCapturing"] = @(false);
250
299
 
251
- // Save wav if needed
252
- if (state->options[@"audioOutputPath"] != nil) {
253
- // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
254
- [RNWhisperAudioUtils
255
- saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
256
- sliceNSamples:state->sliceNSamples]
257
- audioOutputFile:state->options[@"audioOutputPath"]
258
- ];
259
- }
260
-
261
- state->transcribeHandler(state->jobId, @"end", result);
300
+ [state->mSelf finishRealtimeTranscribe:state result:result];
262
301
  } else if (code == 0) {
263
302
  result[@"isCapturing"] = @(true);
264
- state->transcribeHandler(state->jobId, @"transcribe", result);
303
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
265
304
  } else {
266
305
  result[@"isCapturing"] = @(true);
267
- state->transcribeHandler(state->jobId, @"transcribe", result);
306
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
268
307
  }
269
308
 
270
309
  if (continueNeeded) {
@@ -292,8 +331,7 @@ void AudioInputCallback(void * inUserData,
292
331
  onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
293
332
  {
294
333
  self->recordState.transcribeHandler = onTranscribe;
295
- self->recordState.jobId = jobId;
296
- [self prepareRealtime:options];
334
+ [self prepareRealtime:jobId options:options];
297
335
 
298
336
  OSStatus status = AudioQueueNewInput(
299
337
  &self->recordState.dataFormat,
@@ -321,9 +359,10 @@ void AudioInputCallback(void * inUserData,
321
359
  struct rnwhisper_segments_callback_data {
322
360
  void (^onNewSegments)(NSDictionary *);
323
361
  int total_n_new;
362
+ bool tdrzEnable;
324
363
  };
325
364
 
326
- - (void)transcribeFile:(int)jobId
365
+ - (void)transcribeData:(int)jobId
327
366
  audioData:(float *)audioData
328
367
  audioDataCount:(int)audioDataCount
329
368
  options:(NSDictionary *)options
@@ -334,9 +373,9 @@ struct rnwhisper_segments_callback_data {
334
373
  dispatch_async(dQueue, ^{
335
374
  self->recordState.isStoppedByAction = false;
336
375
  self->recordState.isTranscribing = true;
337
- self->recordState.jobId = jobId;
338
376
 
339
- whisper_full_params params = [self getParams:options jobId:jobId];
377
+ whisper_full_params params = [self createParams:options jobId:jobId];
378
+
340
379
  if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
341
380
  params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
342
381
  void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -354,12 +393,18 @@ struct rnwhisper_segments_callback_data {
354
393
  NSMutableArray *segments = [[NSMutableArray alloc] init];
355
394
  for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
356
395
  const char * text_cur = whisper_full_get_segment_text(ctx, i);
357
- text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
396
+ NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
397
+
398
+ if (data->tdrzEnable && whisper_full_get_segment_speaker_turn_next(ctx, i)) {
399
+ [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
400
+ }
401
+
402
+ text = [text stringByAppendingString:mutable_ns_text];
358
403
 
359
404
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
360
405
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
361
406
  NSDictionary *segment = @{
362
- @"text": [NSString stringWithUTF8String:text_cur],
407
+ @"text": [NSString stringWithString:mutable_ns_text],
363
408
  @"t0": [NSNumber numberWithLongLong:t0],
364
409
  @"t1": [NSNumber numberWithLongLong:t1]
365
410
  };
@@ -377,12 +422,16 @@ struct rnwhisper_segments_callback_data {
377
422
  };
378
423
  struct rnwhisper_segments_callback_data user_data = {
379
424
  .onNewSegments = onNewSegments,
380
- .total_n_new = 0
425
+ .tdrzEnable = options[@"tdrzEnable"] && [options[@"tdrzEnable"] boolValue],
426
+ .total_n_new = 0,
381
427
  };
382
428
  params.new_segment_callback_user_data = &user_data;
383
429
  }
384
- int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
385
- self->recordState.jobId = -1;
430
+
431
+ rnwhisper::job* job = rnwhisper::job_new(jobId, params);
432
+ self->recordState.job = job;
433
+ int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
434
+ rnwhisper::job_remove(jobId);
386
435
  self->recordState.isTranscribing = false;
387
436
  onEnd(code);
388
437
  });
@@ -397,9 +446,13 @@ struct rnwhisper_segments_callback_data {
397
446
  }
398
447
 
399
448
  - (void)stopTranscribe:(int)jobId {
400
- rn_whisper_abort_transcribe(jobId);
449
+ if (self->recordState.job) self->recordState.job->abort();
401
450
  if (self->recordState.isRealtime && self->recordState.isCapturing) {
402
451
  [self stopAudio];
452
+ if (!self->recordState.isTranscribing) {
453
+ // Handle for VAD case
454
+ self->recordState.transcribeHandler(jobId, @"end", @{});
455
+ }
403
456
  }
404
457
  self->recordState.isCapturing = false;
405
458
  self->recordState.isStoppedByAction = true;
@@ -407,13 +460,11 @@ struct rnwhisper_segments_callback_data {
407
460
  }
408
461
 
409
462
  - (void)stopCurrentTranscribe {
410
- if (!self->recordState.jobId) {
411
- return;
412
- }
413
- [self stopTranscribe:self->recordState.jobId];
463
+ if (self->recordState.job == nullptr) return;
464
+ [self stopTranscribe:self->recordState.job->job_id];
414
465
  }
415
466
 
416
- - (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId {
467
+ - (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
417
468
  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
418
469
 
419
470
  const int n_threads = options[@"maxThreads"] != nil ?
@@ -432,9 +483,8 @@ struct rnwhisper_segments_callback_data {
432
483
  params.print_progress = false;
433
484
  params.print_timestamps = false;
434
485
  params.print_special = false;
435
- params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
436
486
  params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
437
- params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
487
+ params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
438
488
  params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
439
489
  params.offset_ms = 0;
440
490
  params.no_context = true;
@@ -444,6 +494,7 @@ struct rnwhisper_segments_callback_data {
444
494
  params.max_len = [options[@"maxLen"] intValue];
445
495
  }
446
496
  params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
497
+ params.tdrz_enable = options[@"tdrzEnable"] != nil ? [options[@"tdrzEnable"] boolValue] : false;
447
498
 
448
499
  if (options[@"bestOf"] != nil) {
449
500
  params.greedy.best_of = [options[@"bestOf"] intValue];
@@ -451,7 +502,6 @@ struct rnwhisper_segments_callback_data {
451
502
  if (options[@"maxContext"] != nil) {
452
503
  params.n_max_text_ctx = [options[@"maxContext"] intValue];
453
504
  }
454
-
455
505
  if (options[@"offset"] != nil) {
456
506
  params.offset_ms = [options[@"offset"] intValue];
457
507
  }
@@ -467,35 +517,20 @@ struct rnwhisper_segments_callback_data {
467
517
  if (options[@"temperatureInc"] != nil) {
468
518
  params.temperature_inc = [options[@"temperature_inc"] floatValue];
469
519
  }
470
-
471
520
  if (options[@"prompt"] != nil) {
472
- params.initial_prompt = [options[@"prompt"] UTF8String];
521
+ params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
473
522
  }
474
523
 
475
- // abort handler
476
- params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
477
- bool is_aborted = *(bool*)user_data;
478
- return !is_aborted;
479
- };
480
- params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(jobId);
481
- params.abort_callback = [](void * user_data) {
482
- bool is_aborted = *(bool*)user_data;
483
- return is_aborted;
484
- };
485
- params.abort_callback_user_data = rn_whisper_assign_abort_map(jobId);
486
-
487
524
  return params;
488
525
  }
489
526
 
490
- - (int)fullTranscribe:(int)jobId
491
- params:(struct whisper_full_params)params
527
+ - (int)fullTranscribe:(rnwhisper::job *)job
492
528
  audioData:(float *)audioData
493
529
  audioDataCount:(int)audioDataCount
494
530
  {
495
531
  whisper_reset_timings(self->ctx);
496
-
497
- int code = whisper_full(self->ctx, params, audioData, audioDataCount);
498
- rn_whisper_remove_abort_map(jobId);
532
+ int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
533
+ if (job && job->is_aborted()) code = -999;
499
534
  // if (code == 0) {
500
535
  // whisper_print_timings(self->ctx);
501
536
  // }
@@ -509,12 +544,21 @@ struct rnwhisper_segments_callback_data {
509
544
  NSMutableArray *segments = [[NSMutableArray alloc] init];
510
545
  for (int i = 0; i < n_segments; i++) {
511
546
  const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
512
- text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
547
+ NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
548
+
549
+ // Simplified condition
550
+ if (self->recordState.options[@"tdrzEnable"] &&
551
+ [self->recordState.options[@"tdrzEnable"] boolValue] &&
552
+ whisper_full_get_segment_speaker_turn_next(self->ctx, i)) {
553
+ [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
554
+ }
555
+
556
+ text = [text stringByAppendingString:mutable_ns_text];
513
557
 
514
558
  const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
515
559
  const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
516
560
  NSDictionary *segment = @{
517
- @"text": [NSString stringWithUTF8String:text_cur],
561
+ @"text": [NSString stringWithString:mutable_ns_text],
518
562
  @"t0": [NSNumber numberWithLongLong:t0],
519
563
  @"t1": [NSNumber numberWithLongLong:t1]
520
564
  };
@@ -526,10 +570,19 @@ struct rnwhisper_segments_callback_data {
526
570
  return result;
527
571
  }
528
572
 
573
+ - (NSString *)bench:(int)maxThreads {
574
+ const int n_threads = maxThreads > 0 ? maxThreads : 0;
575
+
576
+ const int max_threads = (int) [[NSProcessInfo processInfo] processorCount];
577
+ // Use 2 threads by default on 4-core devices, 4 threads on more cores
578
+ const int default_n_threads = max_threads == 4 ? 2 : MIN(4, max_threads);
579
+ NSString *result = [NSString stringWithUTF8String:rnwhisper::bench(self->ctx, n_threads).c_str()];
580
+ return result;
581
+ }
582
+
529
583
  - (void)invalidate {
530
584
  [self stopCurrentTranscribe];
531
585
  whisper_free(self->ctx);
532
- [self freeBufferIfNeeded];
533
586
  }
534
587
 
535
588
  @end
package/jest/mock.js CHANGED
@@ -2,12 +2,17 @@ const { NativeModules, DeviceEventEmitter } = require('react-native')
2
2
 
3
3
  if (!NativeModules.RNWhisper) {
4
4
  NativeModules.RNWhisper = {
5
- initContext: jest.fn(() => Promise.resolve(1)),
5
+ initContext: jest.fn(() => Promise.resolve({ contextId: 1 })),
6
6
  transcribeFile: jest.fn(() => Promise.resolve({
7
7
  result: ' Test',
8
8
  segments: [{ text: ' Test', t0: 0, t1: 33 }],
9
9
  isAborted: false,
10
10
  })),
11
+ transcribeData: jest.fn(() => Promise.resolve({
12
+ result: ' Test',
13
+ segments: [{ text: ' Test', t0: 0, t1: 33 }],
14
+ isAborted: false,
15
+ })),
11
16
  startRealtimeTranscribe: jest.fn((contextId, jobId) => {
12
17
  setTimeout(() => {
13
18
  // Start
@@ -45,11 +50,19 @@ if (!NativeModules.RNWhisper) {
45
50
  })
46
51
  })
47
52
  }),
53
+ bench: jest.fn(() => Promise.resolve({
54
+ config: 'NEON',
55
+ nThreads: 1,
56
+ encodeMs: 1,
57
+ decodeMs: 1,
58
+ batchMs: 1,
59
+ promptMs: 1,
60
+ })),
48
61
  releaseContext: jest.fn(() => Promise.resolve()),
49
62
  releaseAllContexts: jest.fn(() => Promise.resolve()),
50
63
 
51
64
  // iOS AudioSession utils
52
- getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
65
+ getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
53
66
  category: 'AVAudioSessionCategoryPlayAndRecord',
54
67
  options: [],
55
68
  })),
@@ -1 +1 @@
1
- {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GAyFnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
1
+ {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0GnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}