whisper.rn 0.4.0-rc.1 → 0.4.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +14 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +226 -109
  9. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  10. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  11. package/cpp/README.md +1 -1
  12. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  13. package/cpp/coreml/whisper-encoder.h +4 -0
  14. package/cpp/coreml/whisper-encoder.mm +5 -3
  15. package/cpp/ggml-aarch64.c +129 -0
  16. package/cpp/ggml-aarch64.h +19 -0
  17. package/cpp/ggml-alloc.c +805 -400
  18. package/cpp/ggml-alloc.h +60 -10
  19. package/cpp/ggml-backend-impl.h +216 -0
  20. package/cpp/ggml-backend-reg.cpp +204 -0
  21. package/cpp/ggml-backend.cpp +1996 -0
  22. package/cpp/ggml-backend.cpp.rej +12 -0
  23. package/cpp/ggml-backend.h +336 -0
  24. package/cpp/ggml-common.h +1853 -0
  25. package/cpp/ggml-cpp.h +38 -0
  26. package/cpp/ggml-cpu-aarch64.c +3560 -0
  27. package/cpp/ggml-cpu-aarch64.h +30 -0
  28. package/cpp/ggml-cpu-impl.h +371 -0
  29. package/cpp/ggml-cpu-quants.c +10822 -0
  30. package/cpp/ggml-cpu-quants.h +63 -0
  31. package/cpp/ggml-cpu.c +13970 -0
  32. package/cpp/ggml-cpu.cpp +663 -0
  33. package/cpp/ggml-cpu.h +177 -0
  34. package/cpp/ggml-impl.h +551 -0
  35. package/cpp/ggml-metal-impl.h +249 -0
  36. package/cpp/ggml-metal.h +24 -43
  37. package/cpp/ggml-metal.m +4190 -1075
  38. package/cpp/ggml-quants.c +5247 -0
  39. package/cpp/ggml-quants.h +100 -0
  40. package/cpp/ggml-threading.cpp +12 -0
  41. package/cpp/ggml-threading.h +12 -0
  42. package/cpp/ggml-whisper.metallib +0 -0
  43. package/cpp/ggml.c +5474 -18763
  44. package/cpp/ggml.h +833 -628
  45. package/cpp/rn-audioutils.cpp +68 -0
  46. package/cpp/rn-audioutils.h +14 -0
  47. package/cpp/rn-whisper-log.h +11 -0
  48. package/cpp/rn-whisper.cpp +221 -52
  49. package/cpp/rn-whisper.h +50 -15
  50. package/cpp/whisper.cpp +2872 -1371
  51. package/cpp/whisper.h +170 -41
  52. package/ios/RNWhisper.mm +139 -46
  53. package/ios/RNWhisperAudioUtils.h +1 -2
  54. package/ios/RNWhisperAudioUtils.m +18 -67
  55. package/ios/RNWhisperContext.h +11 -8
  56. package/ios/RNWhisperContext.mm +195 -150
  57. package/jest/mock.js +15 -2
  58. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  59. package/lib/commonjs/index.js +76 -28
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/commonjs/version.json +1 -1
  62. package/lib/module/NativeRNWhisper.js.map +1 -1
  63. package/lib/module/index.js +76 -28
  64. package/lib/module/index.js.map +1 -1
  65. package/lib/module/version.json +1 -1
  66. package/lib/typescript/NativeRNWhisper.d.ts +13 -4
  67. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  68. package/lib/typescript/index.d.ts +37 -5
  69. package/lib/typescript/index.d.ts.map +1 -1
  70. package/package.json +9 -7
  71. package/src/NativeRNWhisper.ts +20 -4
  72. package/src/index.ts +98 -42
  73. package/src/version.json +1 -1
  74. package/whisper-rn.podspec +11 -18
  75. package/cpp/ggml-metal.metal +0 -2353
package/ios/RNWhisperContext.mm CHANGED
@@ -1,30 +1,97 @@
  #import "RNWhisperContext.h"
- #import "RNWhisperAudioUtils.h"
+ #import <Metal/Metal.h>
  #include <vector>
 
  #define NUM_BYTES_PER_BUFFER 16 * 1024
 
  @implementation RNWhisperContext
 
- + (instancetype)initWithModelPath:(NSString *)modelPath contextId:(int)contextId noCoreML:(BOOL)noCoreML {
+ + (instancetype)initWithModelPath:(NSString *)modelPath
+                         contextId:(int)contextId
+                          noCoreML:(BOOL)noCoreML
+                           noMetal:(BOOL)noMetal
+                      useFlashAttn:(BOOL)useFlashAttn
+ {
      RNWhisperContext *context = [[RNWhisperContext alloc] init];
      context->contextId = contextId;
- #ifdef WHISPER_USE_COREML
-     if (noCoreML) {
-         context->ctx = whisper_init_from_file_no_coreml([modelPath UTF8String]);
-     } else {
-         context->ctx = whisper_init_from_file([modelPath UTF8String]);
+     struct whisper_context_params cparams;
+     NSString *reasonNoMetal = @"";
+     cparams.use_gpu = !noMetal;
+     cparams.flash_attn = useFlashAttn;
+
+     // TODO: Expose dtw_token_timestamps and dtw_aheads_preset
+     cparams.dtw_token_timestamps = false;
+     // cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
+
+     cparams.use_coreml = !noCoreML;
+ #ifndef WHISPER_USE_COREML
+     if (cparams.use_coreml) {
+         NSLog(@"[RNWhisper] CoreML is not enabled in this build, ignoring use_coreml option");
+         cparams.use_coreml = false;
+     }
+ #endif
+
+ #ifndef WSP_GGML_USE_METAL
+     if (cparams.use_gpu) {
+         NSLog(@"[RNWhisper] ggml-metal is not enabled in this build, ignoring use_gpu option");
+         cparams.use_gpu = false;
      }
- #else
-     context->ctx = whisper_init_from_file([modelPath UTF8String]);
  #endif
+
+ #ifdef WSP_GGML_USE_METAL
+     if (cparams.use_gpu) {
+ #if TARGET_OS_SIMULATOR
+         NSLog(@"[RNWhisper] ggml-metal is not available in simulator, ignoring use_gpu option: %@", reasonNoMetal);
+         cparams.use_gpu = false;
+ #else // TARGET_OS_SIMULATOR
+         // Check ggml-metal availability
+         NSError * error = nil;
+         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+         id<MTLLibrary> library = [device
+             newLibraryWithSource:@"#include <metal_stdlib>\n"
+                                   "using namespace metal;"
+                                   "kernel void test() { simd_sum(0); }"
+             options:nil
+             error:&error
+         ];
+         if (error) {
+             reasonNoMetal = [error localizedDescription];
+         } else {
+             id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
+             id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
+             if (pipeline == nil) {
+                 reasonNoMetal = [error localizedDescription];
+                 NSLog(@"[RNWhisper] ggml-metal is not available, ignoring use_gpu option: %@", reasonNoMetal);
+                 cparams.use_gpu = false;
+             }
+         }
+ #endif // TARGET_OS_SIMULATOR
+     }
+ #endif // WSP_GGML_USE_METAL
+
+     if (cparams.use_gpu && cparams.use_coreml) {
+         NSLog(@"[RNWhisper] Both use_gpu and use_coreml are enabled, ignoring use_coreml option");
+         cparams.use_coreml = false; // Skip CoreML if Metal is enabled
+     }
+
+     context->ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
      context->dQueue = dispatch_queue_create(
          [[NSString stringWithFormat:@"RNWhisperContext-%d", contextId] UTF8String],
          DISPATCH_QUEUE_SERIAL
      );
+     context->isMetalEnabled = cparams.use_gpu;
+     context->reasonNoMetal = reasonNoMetal;
      return context;
  }
 
+ - (bool)isMetalEnabled {
+     return isMetalEnabled;
+ }
+
+ - (NSString *)reasonNoMetal {
+     return reasonNoMetal;
+ }
+
  - (struct whisper_context *)getContext {
      return self->ctx;
  }
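From the JS side, the new initializer flags surface as context options. A minimal sketch, assuming the wrapper option names that map onto the noMetal/noCoreML/useFlashAttn parameters above (verify against package/src/index.ts in this release):

    import { initWhisper } from 'whisper.rn'

    // useGpu maps to cparams.use_gpu (silently disabled on simulators and
    // non-Metal builds, as shown above); useFlashAttn maps to cparams.flash_attn.
    const whisperContext = await initWhisper({
      filePath: 'path/to/ggml-tiny.en.bin', // hypothetical model path
      useGpu: true,
      useFlashAttn: false,
    })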
@@ -33,7 +100,7 @@
      return self->dQueue;
  }
 
- - (void)prepareRealtime:(NSDictionary *)options {
+ - (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
      self->recordState.options = options;
 
      self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -46,68 +113,40 @@
      self->recordState.dataFormat.mReserved = 0;
      self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
 
-     int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
-     int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
-     self->recordState.maxAudioSec = maxAudioSec;
-
-     int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
-     int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-
-     self->recordState.audioSliceSec = audioSliceSec;
-     self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+     self->recordState.isRealtime = true;
+     self->recordState.isTranscribing = false;
+     self->recordState.isCapturing = false;
+     self->recordState.isStoppedByAction = false;
 
      self->recordState.sliceIndex = 0;
      self->recordState.transcribeSliceIndex = 0;
      self->recordState.nSamplesTranscribing = 0;
 
-     [self freeBufferIfNeeded];
-     self->recordState.shortBufferSlices = [NSMutableArray new];
-
-     int16_t *audioBufferI16 = (int16_t *) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
-     [self->recordState.shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-
-     self->recordState.sliceNSamples = [NSMutableArray new];
-     [self->recordState.sliceNSamples addObject:[NSNumber numberWithInt:0]];
-
-     self->recordState.isRealtime = true;
-     self->recordState.isTranscribing = false;
-     self->recordState.isCapturing = false;
-     self->recordState.isStoppedByAction = false;
+     self->recordState.sliceNSamples.clear();
+     self->recordState.sliceNSamples.push_back(0);
+
+     self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+     self->recordState.job->set_realtime_params(
+         {
+             .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+             .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+             .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+             .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+         },
+         options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+         options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+         options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
+         options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+     );
+     self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
 
      self->recordState.mSelf = self;
  }
 
- - (void)freeBufferIfNeeded {
-     if (self->recordState.shortBufferSlices != nil) {
-         for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
-             int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
-             free(audioBufferI16);
-         }
-         self->recordState.shortBufferSlices = nil;
-     }
- }
-
- bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+ bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
  {
-     bool isSpeech = true;
-     if (!state->isTranscribing && state->options[@"useVad"]) {
-         int vadSec = state->options[@"vadMs"] != nil ? [state->options[@"vadMs"] intValue] / 1000 : 2;
-         int sampleSize = vadSec * WHISPER_SAMPLE_RATE;
-         if (nSamples + n > sampleSize) {
-             int start = nSamples + n - sampleSize;
-             std::vector<float> audioBufferF32Vec(sampleSize);
-             for (int i = 0; i < sampleSize; i++) {
-                 audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-             }
-             float vadThold = state->options[@"vadThold"] != nil ? [state->options[@"vadThold"] floatValue] : 0.6f;
-             float vadFreqThold = state->options[@"vadFreqThold"] != nil ? [state->options[@"vadFreqThold"] floatValue] : 100.0f;
-             isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, vadThold, vadFreqThold, false);
-             NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-         } else {
-             isSpeech = false;
-         }
-     }
-     return isSpeech;
+     if (state->isTranscribing) return true;
+     return state->job->vad_simple(sliceIndex, nSamples, n);
  }
 
  void AudioInputCallback(void * inUserData,
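These are the realtime options consumed by prepareRealtime above, with the defaults taken from this diff. A usage sketch, assuming the public transcribeRealtime wrapper passes these keys through unchanged (wavPath is a hypothetical variable):

    const { stop, subscribe } = await whisperContext.transcribeRealtime({
      realtimeAudioSec: 60,       // total capture window
      realtimeAudioSliceSec: 25,  // enables slicing when < realtimeAudioSec
      realtimeAudioMinSec: 0.5,   // skip transcribing until enough samples (hypothetical value)
      useVad: true,               // gate transcription on detected speech
      vadMs: 2000,                // VAD window (default 2000)
      vadThold: 0.6,              // VAD energy threshold (default 0.6)
      vadFreqThold: 100.0,        // VAD high-pass threshold (default 100.0)
      audioOutputPath: wavPath,   // optional wav dump when the job finishes
    })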
@@ -122,21 +161,21 @@ void AudioInputCallback(void * inUserData,
      if (!state->isCapturing) {
          NSLog(@"[RNWhisper] Not capturing, ignoring audio");
          if (!state->isTranscribing) {
-             state->transcribeHandler(state->jobId, @"end", @{});
+             [state->mSelf finishRealtimeTranscribe:state result:@{}];
          }
          return;
      }
 
      int totalNSamples = 0;
-     for (int i = 0; i < [state->sliceNSamples count]; i++) {
-         totalNSamples += [[state->sliceNSamples objectAtIndex:i] intValue];
+     for (int i = 0; i < state->sliceNSamples.size(); i++) {
+         totalNSamples += state->sliceNSamples[i];
      }
 
      const int n = inBuffer->mAudioDataByteSize / 2;
 
-     int nSamples = [state->sliceNSamples[state->sliceIndex] intValue];
+     int nSamples = state->sliceNSamples[state->sliceIndex];
 
-     if (totalNSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
+     if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
          NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
          state->isCapturing = false;
          [state->mSelf stopAudio];
@@ -145,14 +184,14 @@ void AudioInputCallback(void * inUserData,
          nSamples == state->nSamplesTranscribing &&
          state->sliceIndex == state->transcribeSliceIndex
      ) {
-         state->transcribeHandler(state->jobId, @"end", @{});
+         [state->mSelf finishRealtimeTranscribe:state result:@{}];
      } else if (
          !state->isTranscribing &&
          nSamples != state->nSamplesTranscribing
      ) {
-         int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
-         if (!vad(state, audioBufferI16, nSamples, 0)) {
-             state->transcribeHandler(state->jobId, @"end", @{});
+         bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+         if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
+             [state->mSelf finishRealtimeTranscribe:state result:@{}];
              return;
          }
          state->isTranscribing = true;
@@ -163,31 +202,25 @@ void AudioInputCallback(void * inUserData,
          return;
      }
 
-     int audioSliceSec = state->audioSliceSec;
-     if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+     if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
          // next slice
          state->sliceIndex++;
          nSamples = 0;
-         int16_t* audioBufferI16 = (int16_t*) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
-         [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-         [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+         state->sliceNSamples.push_back(0);
      }
 
-     // Append to buffer
-     NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
+     NSLog(@"[RNWhisper] Slice %d has %d samples, put %d samples", state->sliceIndex, nSamples, n);
 
-     int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
-     for (int i = 0; i < n; i++) {
-         audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
-     }
+     state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
 
-     bool isSpeech = vad(state, audioBufferI16, nSamples, n);
+     bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
      nSamples += n;
-     state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
+     state->sliceNSamples[state->sliceIndex] = nSamples;
 
      AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 
-     if (!isSpeech) return;
+     bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+     if (!isSamplesEnough || !isSpeech) return;
 
      if (!state->isTranscribing) {
          state->isTranscribing = true;
@@ -197,21 +230,29 @@ void AudioInputCallback(void * inUserData,
      }
  }
 
+ - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
+     // Save wav if needed
+     if (state->job->audio_output_path != nullptr) {
+         // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
+         rnaudioutils::save_wav_file(
+             rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+             state->job->audio_output_path
+         );
+     }
+     state->transcribeHandler(state->job->job_id, @"end", result);
+     rnwhisper::job_remove(state->job->job_id);
+ }
+
  - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
-     int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+     int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
      state->nSamplesTranscribing = nSamplesOfIndex;
      NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
 
-     int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->transcribeSliceIndex] pointerValue];
-     float* audioBufferF32 = (float*) malloc(state->nSamplesTranscribing * sizeof(float));
-     // convert I16 to F32
-     for (int i = 0; i < state->nSamplesTranscribing; i++) {
-         audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
-     }
+     float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
+
      CFTimeInterval timeStart = CACurrentMediaTime();
-     struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
-     int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
-     free(audioBufferF32);
+     int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+     free(pcmf32);
      CFTimeInterval timeEnd = CACurrentMediaTime();
      const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
 
@@ -231,7 +272,7 @@ void AudioInputCallback(void * inUserData,
          result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
      }
 
-     nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+     nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
 
      bool isStopped = state->isStoppedByAction || (
          !state->isCapturing &&
@@ -256,23 +297,13 @@ void AudioInputCallback(void * inUserData,
          result[@"isStoppedByAction"] = @(state->isStoppedByAction);
          result[@"isCapturing"] = @(false);
 
-         // Save wav if needed
-         if (state->options[@"audioOutputPath"] != nil) {
-             // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-             [RNWhisperAudioUtils
-                 saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
-                 sliceNSamples:state->sliceNSamples]
-                 audioOutputFile:state->options[@"audioOutputPath"]
-             ];
-         }
-
-         state->transcribeHandler(state->jobId, @"end", result);
+         [state->mSelf finishRealtimeTranscribe:state result:result];
      } else if (code == 0) {
          result[@"isCapturing"] = @(true);
-         state->transcribeHandler(state->jobId, @"transcribe", result);
+         state->transcribeHandler(state->job->job_id, @"transcribe", result);
      } else {
          result[@"isCapturing"] = @(true);
-         state->transcribeHandler(state->jobId, @"transcribe", result);
+         state->transcribeHandler(state->job->job_id, @"transcribe", result);
      }
 
      if (continueNeeded) {
@@ -300,8 +331,7 @@ void AudioInputCallback(void * inUserData,
      onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
  {
      self->recordState.transcribeHandler = onTranscribe;
-     self->recordState.jobId = jobId;
-     [self prepareRealtime:options];
+     [self prepareRealtime:jobId options:options];
 
      OSStatus status = AudioQueueNewInput(
          &self->recordState.dataFormat,
@@ -329,9 +359,10 @@
  struct rnwhisper_segments_callback_data {
      void (^onNewSegments)(NSDictionary *);
      int total_n_new;
+     bool tdrzEnable;
  };
 
- - (void)transcribeFile:(int)jobId
+ - (void)transcribeData:(int)jobId
      audioData:(float *)audioData
      audioDataCount:(int)audioDataCount
      options:(NSDictionary *)options
@@ -342,9 +373,9 @@ struct rnwhisper_segments_callback_data {
      dispatch_async(dQueue, ^{
          self->recordState.isStoppedByAction = false;
          self->recordState.isTranscribing = true;
-         self->recordState.jobId = jobId;
 
-         whisper_full_params params = [self getParams:options jobId:jobId];
+         whisper_full_params params = [self createParams:options jobId:jobId];
+
          if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
              params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
                  void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -362,12 +393,18 @@ struct rnwhisper_segments_callback_data {
      NSMutableArray *segments = [[NSMutableArray alloc] init];
      for (int i = data->total_n_new - n_new; i < data->total_n_new; i++) {
          const char * text_cur = whisper_full_get_segment_text(ctx, i);
-         text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+         NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+         if (data->tdrzEnable && whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+             [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+         }
+
+         text = [text stringByAppendingString:mutable_ns_text];
 
          const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
          const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
          NSDictionary *segment = @{
-             @"text": [NSString stringWithUTF8String:text_cur],
+             @"text": [NSString stringWithString:mutable_ns_text],
              @"t0": [NSNumber numberWithLongLong:t0],
              @"t1": [NSNumber numberWithLongLong:t1]
          };
@@ -385,12 +422,16 @@ struct rnwhisper_segments_callback_data {
          };
          struct rnwhisper_segments_callback_data user_data = {
              .onNewSegments = onNewSegments,
-             .total_n_new = 0
+             .tdrzEnable = options[@"tdrzEnable"] && [options[@"tdrzEnable"] boolValue],
+             .total_n_new = 0,
          };
          params.new_segment_callback_user_data = &user_data;
      }
-         int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
-         self->recordState.jobId = -1;
+
+         rnwhisper::job* job = rnwhisper::job_new(jobId, params);
+         self->recordState.job = job;
+         int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+         rnwhisper::job_remove(jobId);
          self->recordState.isTranscribing = false;
          onEnd(code);
      });
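On the JS side, the new tdrzEnable flag and the streaming callback wired up above look roughly like this; the " [SPEAKER_TURN]" marker is appended by the native code when the model predicts a speaker change. A sketch, assuming the wrapper's transcribe returns { stop, promise } and that onNewSegments receives the segments array built above:

    const { promise } = whisperContext.transcribe('path/to/audio.wav', {
      tdrzEnable: true, // requires a tinydiarize-capable model
      onNewSegments: ({ segments }) => {
        // Segment text may now end with ' [SPEAKER_TURN]'
        segments.forEach((s) => console.log(`${s.t0}-${s.t1}: ${s.text}`))
      },
    })
    await promise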
@@ -405,9 +446,13 @@ struct rnwhisper_segments_callback_data {
  }
 
  - (void)stopTranscribe:(int)jobId {
-     rn_whisper_abort_transcribe(jobId);
+     if (self->recordState.job) self->recordState.job->abort();
      if (self->recordState.isRealtime && self->recordState.isCapturing) {
          [self stopAudio];
+         if (!self->recordState.isTranscribing) {
+             // Handle for VAD case
+             self->recordState.transcribeHandler(jobId, @"end", @{});
+         }
      }
      self->recordState.isCapturing = false;
      self->recordState.isStoppedByAction = true;
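Aborting now goes through job->abort() on the job object instead of the old per-job abort map, and fullTranscribe below reports an aborted run as code -999. From JS, stopping mid-run resolves rather than rejects; a sketch, assuming the { stop, promise } wrapper shape and the isAborted result field shown in the jest mock:

    const { stop, promise } = whisperContext.transcribe('path/to/audio.wav')
    setTimeout(() => stop(), 1000) // aborts the underlying whisper job
    const { isAborted } = await promise // isAborted: true after stop()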
@@ -415,13 +460,11 @@
  }
 
  - (void)stopCurrentTranscribe {
-     if (!self->recordState.jobId) {
-         return;
-     }
-     [self stopTranscribe:self->recordState.jobId];
+     if (self->recordState.job == nullptr) return;
+     [self stopTranscribe:self->recordState.job->job_id];
  }
 
- - (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId {
+ - (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
      struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
      const int n_threads = options[@"maxThreads"] != nil ?
@@ -440,9 +483,8 @@
      params.print_progress = false;
      params.print_timestamps = false;
      params.print_special = false;
-     params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
      params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
-     params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+     params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
      params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
      params.offset_ms = 0;
      params.no_context = true;
@@ -452,6 +494,7 @@
          params.max_len = [options[@"maxLen"] intValue];
      }
      params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+     params.tdrz_enable = options[@"tdrzEnable"] != nil ? [options[@"tdrzEnable"] boolValue] : false;
 
      if (options[@"bestOf"] != nil) {
          params.greedy.best_of = [options[@"bestOf"] intValue];
@@ -459,7 +502,6 @@
      if (options[@"maxContext"] != nil) {
          params.n_max_text_ctx = [options[@"maxContext"] intValue];
      }
-
      if (options[@"offset"] != nil) {
          params.offset_ms = [options[@"offset"] intValue];
      }
@@ -475,35 +517,20 @@
      if (options[@"temperatureInc"] != nil) {
          params.temperature_inc = [options[@"temperature_inc"] floatValue];
      }
-
      if (options[@"prompt"] != nil) {
-         params.initial_prompt = [options[@"prompt"] UTF8String];
+         params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
      }
 
-     // abort handler
-     params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-         bool is_aborted = *(bool*)user_data;
-         return !is_aborted;
-     };
-     params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(jobId);
-     params.abort_callback = [](void * user_data) {
-         bool is_aborted = *(bool*)user_data;
-         return is_aborted;
-     };
-     params.abort_callback_user_data = rn_whisper_assign_abort_map(jobId);
-
      return params;
  }
 
- - (int)fullTranscribe:(int)jobId
-     params:(struct whisper_full_params)params
+ - (int)fullTranscribe:(rnwhisper::job *)job
      audioData:(float *)audioData
      audioDataCount:(int)audioDataCount
  {
      whisper_reset_timings(self->ctx);
-
-     int code = whisper_full(self->ctx, params, audioData, audioDataCount);
-     rn_whisper_remove_abort_map(jobId);
+     int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+     if (job && job->is_aborted()) code = -999;
      // if (code == 0) {
      //     whisper_print_timings(self->ctx);
      // }
@@ -517,12 +544,21 @@ struct rnwhisper_segments_callback_data {
      NSMutableArray *segments = [[NSMutableArray alloc] init];
      for (int i = 0; i < n_segments; i++) {
          const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
-         text = [text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+         NSMutableString *mutable_ns_text = [NSMutableString stringWithUTF8String:text_cur];
+
+         // Simplified condition
+         if (self->recordState.options[@"tdrzEnable"] &&
+             [self->recordState.options[@"tdrzEnable"] boolValue] &&
+             whisper_full_get_segment_speaker_turn_next(self->ctx, i)) {
+             [mutable_ns_text appendString:@" [SPEAKER_TURN]"];
+         }
+
+         text = [text stringByAppendingString:mutable_ns_text];
 
          const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
          const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
          NSDictionary *segment = @{
-             @"text": [NSString stringWithUTF8String:text_cur],
+             @"text": [NSString stringWithString:mutable_ns_text],
              @"t0": [NSNumber numberWithLongLong:t0],
              @"t1": [NSNumber numberWithLongLong:t1]
          };
@@ -534,10 +570,19 @@
      return result;
  }
 
+ - (NSString *)bench:(int)maxThreads {
+     const int n_threads = maxThreads > 0 ? maxThreads : 0;
+
+     const int max_threads = (int) [[NSProcessInfo processInfo] processorCount];
+     // Use 2 threads by default on 4-core devices, 4 threads on more cores
+     const int default_n_threads = max_threads == 4 ? 2 : MIN(4, max_threads);
+     NSString *result = [NSString stringWithUTF8String:rnwhisper::bench(self->ctx, n_threads).c_str()];
+     return result;
+ }
+
  - (void)invalidate {
      [self stopCurrentTranscribe];
      whisper_free(self->ctx);
-     [self freeBufferIfNeeded];
  }
 
  @end
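The new bench method wraps rnwhisper::bench, and the jest mock below mirrors its parsed result shape. A sketch, assuming the wrapper exposes it as context.bench(maxThreads):

    // Passing 0 lets the native side choose a thread count.
    const { config, nThreads, encodeMs, decodeMs, batchMs, promptMs } =
      await whisperContext.bench(4)
    console.log(`${config}: encode ${encodeMs}ms / decode ${decodeMs}ms`)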
package/jest/mock.js CHANGED
@@ -2,12 +2,17 @@ const { NativeModules, DeviceEventEmitter } = require('react-native')
 
  if (!NativeModules.RNWhisper) {
    NativeModules.RNWhisper = {
-     initContext: jest.fn(() => Promise.resolve(1)),
+     initContext: jest.fn(() => Promise.resolve({ contextId: 1 })),
      transcribeFile: jest.fn(() => Promise.resolve({
        result: ' Test',
        segments: [{ text: ' Test', t0: 0, t1: 33 }],
        isAborted: false,
      })),
+     transcribeData: jest.fn(() => Promise.resolve({
+       result: ' Test',
+       segments: [{ text: ' Test', t0: 0, t1: 33 }],
+       isAborted: false,
+     })),
      startRealtimeTranscribe: jest.fn((contextId, jobId) => {
        setTimeout(() => {
          // Start
@@ -45,11 +50,19 @@ if (!NativeModules.RNWhisper) {
        })
      })
    }),
+   bench: jest.fn(() => Promise.resolve({
+     config: 'NEON',
+     nThreads: 1,
+     encodeMs: 1,
+     decodeMs: 1,
+     batchMs: 1,
+     promptMs: 1,
+   })),
    releaseContext: jest.fn(() => Promise.resolve()),
    releaseAllContexts: jest.fn(() => Promise.resolve()),
 
    // iOS AudioSession utils
-   getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
+   getAudioSessionCurrentCategory: jest.fn(() => Promise.resolve({
      category: 'AVAudioSessionCategoryPlayAndRecord',
      options: [],
    })),
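With the mock registered, tests can drive the new surface without native code. A sketch exercising the mocked native module directly (the mock file above installs itself on NativeModules):

    import { NativeModules } from 'react-native'
    require('whisper.rn/jest/mock')

    test('mocked initContext resolves a context id', async () => {
      await expect(NativeModules.RNWhisper.initContext({}))
        .resolves.toEqual({ contextId: 1 })
    })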
@@ -1 +1 @@
- {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0FnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
+ {"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"../../src","sources":["NativeRNWhisper.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA0GnCC,gCAAmB,CAACC,GAAG,CAAO,WAAW,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}