whisper.rn 0.4.0-rc.4 → 0.4.0-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +5 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +57 -134
  6. package/android/src/main/jni-utils.h +76 -0
  7. package/android/src/main/jni.cpp +188 -112
  8. package/cpp/README.md +1 -1
  9. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  10. package/cpp/coreml/whisper-encoder.h +4 -0
  11. package/cpp/coreml/whisper-encoder.mm +4 -2
  12. package/cpp/ggml-alloc.c +55 -19
  13. package/cpp/ggml-alloc.h +8 -1
  14. package/cpp/ggml-backend-impl.h +46 -21
  15. package/cpp/ggml-backend.c +563 -156
  16. package/cpp/ggml-backend.h +62 -17
  17. package/cpp/ggml-impl.h +1 -1
  18. package/cpp/ggml-metal-whisper.metal +2444 -359
  19. package/cpp/ggml-metal.h +7 -1
  20. package/cpp/ggml-metal.m +1105 -197
  21. package/cpp/ggml-quants.c +66 -61
  22. package/cpp/ggml-quants.h +40 -40
  23. package/cpp/ggml.c +1040 -1590
  24. package/cpp/ggml.h +109 -30
  25. package/cpp/rn-audioutils.cpp +68 -0
  26. package/cpp/rn-audioutils.h +14 -0
  27. package/cpp/rn-whisper-log.h +11 -0
  28. package/cpp/rn-whisper.cpp +143 -59
  29. package/cpp/rn-whisper.h +48 -15
  30. package/cpp/whisper.cpp +1635 -928
  31. package/cpp/whisper.h +55 -10
  32. package/ios/RNWhisper.mm +7 -7
  33. package/ios/RNWhisperAudioUtils.h +0 -2
  34. package/ios/RNWhisperAudioUtils.m +0 -56
  35. package/ios/RNWhisperContext.h +3 -11
  36. package/ios/RNWhisperContext.mm +68 -137
  37. package/lib/commonjs/index.js.map +1 -1
  38. package/lib/commonjs/version.json +1 -1
  39. package/lib/module/index.js.map +1 -1
  40. package/lib/module/version.json +1 -1
  41. package/lib/typescript/index.d.ts +5 -0
  42. package/lib/typescript/index.d.ts.map +1 -1
  43. package/package.json +6 -5
  44. package/src/index.ts +5 -0
  45. package/src/version.json +1 -1
  46. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
  47. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
  48. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  49. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
package/cpp/whisper.h CHANGED
@@ -1,6 +1,8 @@
  #ifndef WHISPER_H
  #define WHISPER_H

+ #include "ggml.h"
+
  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>
@@ -48,7 +50,9 @@ extern "C" {
  //
  // ...
  //
- // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+ // whisper_context_params cparams = whisper_context_default_params();
+ //
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
  //
  // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
  // fprintf(stderr, "failed to process audio\n");
@@ -76,7 +80,9 @@
  struct whisper_state;
  struct whisper_full_params;

- typedef int whisper_token;
+ typedef int32_t whisper_pos;
+ typedef int32_t whisper_token;
+ typedef int32_t whisper_seq_id;

  struct whisper_context_params {
  bool use_gpu;
@@ -108,18 +114,49 @@
  void (*close)(void * ctx);
  } whisper_model_loader;

+ // grammar element type
+ enum whisper_gretype {
+ // end of rule definition
+ WHISPER_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ WHISPER_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ WHISPER_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ WHISPER_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b] [^abc])
+ WHISPER_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding WHISPER_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ WHISPER_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct whisper_grammar_element {
+ enum whisper_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } whisper_grammar_element;
+
  // Various functions for loading a ggml whisper model.
  // Allocate (almost) all memory needed for the model.
  // Return NULL on failure
- WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params);
- WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
- WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);

  // These are the same as the above, but the internal state of the context is not allocated automatically
  // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
- WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params);
- WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
- WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);

  WHISPER_DEPRECATED(
  WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
@@ -279,6 +316,9 @@
  // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
  WHISPER_API const char * whisper_lang_str(int id);

+ // Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
+ WHISPER_API const char * whisper_lang_str_full(int id);
+
  // Use mel data at offset_ms to try and auto-detect the spoken language
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
  // Returns the top language id or negative on failure
@@ -401,6 +441,7 @@

  bool translate;
  bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
+ bool no_timestamps; // do not generate timestamps
  bool single_segment; // force single segment output (useful for streaming)
  bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
  bool print_progress; // print progress information
@@ -478,6 +519,11 @@
  // called by each decoder to filter obtained logits
  whisper_logits_filter_callback logits_filter_callback;
  void * logits_filter_callback_user_data;
+
+ const whisper_grammar_element ** grammar_rules;
+ size_t n_grammar_rules;
+ size_t i_start_rule;
+ float grammar_penalty;
  };

  // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
@@ -571,8 +617,7 @@

  // Control logging output; default behavior is to print to stderr

- typedef void (*whisper_log_callback)(const char * line);
- WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+ WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);

  #ifdef __cplusplus
  }
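
Note on the whisper.h changes above: context creation now takes an explicit whisper_context_params argument, and logging is routed through whisper_log_set instead of the removed whisper_set_log_callback. A minimal sketch of the new flow, assuming wsp_ggml_log_callback follows ggml's usual (level, text, user_data) shape (the callback type itself is declared in the bundled ggml.h, not shown in this diff):

    #include "whisper.h"
    #include <cstdio>

    // Assumed ggml-style log callback: (level, text, user_data).
    static void on_log(wsp_ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fprintf(stderr, "%s", text);
    }

    int main() {
        whisper_log_set(on_log, nullptr); // replaces whisper_set_log_callback

        // Explicit context params (use_gpu is the only field shown in this diff).
        struct whisper_context_params cparams = whisper_context_default_params();
        cparams.use_gpu = true;

        struct whisper_context * ctx =
            whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
        if (ctx == nullptr) return 1;

        // ... run whisper_full(ctx, wparams, pcmf32, n_samples) as before ...

        whisper_free(ctx);
        return 0;
    }
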
package/ios/RNWhisper.mm CHANGED
@@ -142,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
  audioDataCount:count
  options:options
  onProgress: ^(int progress) {
- if (rn_whisper_transcribe_is_aborted(jobId)) {
- return;
- }
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
+ if (job && job->is_aborted()) return;
+
  dispatch_async(dispatch_get_main_queue(), ^{
  [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
  body:@{
@@ -156,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
  });
  }
  onNewSegments: ^(NSDictionary *result) {
- if (rn_whisper_transcribe_is_aborted(jobId)) {
- return;
- }
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
+ if (job && job->is_aborted()) return;
+
  dispatch_async(dispatch_get_main_queue(), ^{
  [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
  body:@{
@@ -279,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
  [context invalidate];
  }

- rn_whisper_abort_all_transcribe(); // graceful abort
+ rnwhisper::job_abort_all(); // graceful abort

  [contexts removeAllObjects];
  contexts = nil;
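
The RNWhisper.mm changes above (and the RNWhisperContext.mm changes further down) replace the old rn_whisper_*_abort_map helpers with a per-job registry in the rnwhisper namespace. Its declarations live in the updated cpp/rn-whisper.h, which this excerpt does not include, so the sketch below only exercises the calls that appear in these diffs:

    #include "rn-whisper.h"
    #include "whisper.h"

    // One transcription run, registered so other threads can find and abort it.
    int transcribe_once(whisper_context * ctx, int job_id,
                        whisper_full_params params,
                        const float * pcm, int n_samples) {
        rnwhisper::job * job = rnwhisper::job_new(job_id, params);
        int code = whisper_full(ctx, job->params, pcm, n_samples);
        if (job->is_aborted()) code = -999; // sentinel used by the iOS code
        rnwhisper::job_remove(job_id);      // always unregister when done
        return code;
    }

    // From another thread (stopTranscribe / releaseAllContexts):
    void cancel(int job_id) {
        rnwhisper::job * job = rnwhisper::job_get(job_id);
        if (job) job->abort();         // cancel a single job
        // rnwhisper::job_abort_all(); // or everything, as releaseAllContexts does
    }
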
package/ios/RNWhisperAudioUtils.h CHANGED
@@ -2,8 +2,6 @@

  @interface RNWhisperAudioUtils : NSObject

- + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
- + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
  + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;

  @end
package/ios/RNWhisperAudioUtils.m CHANGED
@@ -3,62 +3,6 @@

  @implementation RNWhisperAudioUtils

- + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
- NSMutableData *outputData = [NSMutableData data];
- for (int i = 0; i < buffers.count; i++) {
- int size = [sliceNSamples objectAtIndex:i].intValue;
- NSValue *buffer = [buffers objectAtIndex:i];
- short *bufferPtr = buffer.pointerValue;
- [outputData appendBytes:bufferPtr length:size * sizeof(short)];
- }
- return outputData;
- }
-
- + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
- NSMutableData *outputData = [NSMutableData data];
-
- // WAVE header
- [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
- int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
- [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
- [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
- [outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
-
- int subchunk1Size = CFSwapInt32HostToLittle(16);
- [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
-
- short audioFormat = CFSwapInt16HostToLittle(1); // PCM
- [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
-
- short numChannels = CFSwapInt16HostToLittle(1); // mono
- [outputData appendBytes:&numChannels length:sizeof(numChannels)];
-
- int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
- [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
-
- // (bitDepth * sampleRate * channels) >> 3
- int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
- [outputData appendBytes:&byteRate length:sizeof(byteRate)];
-
- // (bitDepth * channels) >> 3
- short blockAlign = CFSwapInt16HostToLittle(16 / 8);
- [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
-
- // bitDepth
- short bitsPerSample = CFSwapInt16HostToLittle(16);
- [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
-
- [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
- int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
- [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
-
- // Audio data
- [outputData appendData:rawData];
-
- // Save to file
- [outputData writeToFile:audioOutputFile atomically:YES];
- }
-
  + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
  NSURL *url = [NSURL fileURLWithPath:filePath];
  NSData *fileData = [NSData dataWithContentsOfURL:url];
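
The buffer-concat and WAV-writing helpers removed here now live in the shared C++ layer (the new cpp/rn-audioutils.cpp / rn-audioutils.h in the file list); the RNWhisperContext.mm diff below calls them as rnaudioutils::concat_short_buffers and rnaudioutils::save_wav_file. That file is not part of this excerpt, so the following is only a sketch of the same 44-byte RIFF header the Objective-C code wrote (16-bit mono PCM, little-endian host assumed, matching the removed CFSwap*HostToLittle usage):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void write_wav_16le_mono(const char * path, const std::vector<int16_t> & pcm, uint32_t sample_rate) {
        const uint16_t channels = 1, bits = 16, fmt_pcm = 1;
        const uint32_t fmt_size    = 16;
        const uint32_t data_size   = (uint32_t) (pcm.size() * sizeof(int16_t));
        const uint32_t chunk_size  = 36 + data_size;                    // RIFF chunk size, as in the removed code
        const uint32_t byte_rate   = sample_rate * channels * bits / 8; // 16000 * 1 * 16 / 8 = 32000
        const uint16_t block_align = channels * bits / 8;               // 2 bytes per frame

        FILE * f = fopen(path, "wb");
        if (!f) return;
        fwrite("RIFF", 1, 4, f); fwrite(&chunk_size, 4, 1, f); fwrite("WAVE", 1, 4, f);
        fwrite("fmt ", 1, 4, f); fwrite(&fmt_size, 4, 1, f);
        fwrite(&fmt_pcm, 2, 1, f); fwrite(&channels, 2, 1, f);
        fwrite(&sample_rate, 4, 1, f); fwrite(&byte_rate, 4, 1, f);
        fwrite(&block_align, 2, 1, f); fwrite(&bits, 2, 1, f);
        fwrite("data", 1, 4, f); fwrite(&data_size, 4, 1, f);
        fwrite(pcm.data(), sizeof(int16_t), pcm.size(), f);
        fclose(f);
    }
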
package/ios/RNWhisperContext.h CHANGED
@@ -11,29 +11,21 @@

  typedef struct {
  __unsafe_unretained id mSelf;
-
- int jobId;
  NSDictionary* options;

+ struct rnwhisper::job * job;
+
  bool isTranscribing;
  bool isRealtime;
  bool isCapturing;
  bool isStoppedByAction;
- int maxAudioSec;
  int nSamplesTranscribing;
- NSMutableArray<NSValue *> *shortBufferSlices;
- NSMutableArray<NSNumber *> *sliceNSamples;
+ std::vector<int> sliceNSamples;
  bool isUseSlices;
  int sliceIndex;
  int transcribeSliceIndex;
- int audioSliceSec;
  NSString* audioOutputPath;

- bool useVad;
- int vadMs;
- float vadThold;
- float vadFreqThold;
-
  AudioQueueRef queue;
  AudioStreamBasicDescription dataFormat;
  AudioQueueBufferRef buffers[NUM_BUFFERS];
package/ios/RNWhisperContext.mm CHANGED
@@ -1,5 +1,4 @@
  #import "RNWhisperContext.h"
- #import "RNWhisperAudioUtils.h"
  #import <Metal/Metal.h>
  #include <vector>

@@ -95,7 +94,7 @@
  return self->dQueue;
  }

- - (void)prepareRealtime:(NSDictionary *)options {
+ - (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
  self->recordState.options = options;

  self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -108,74 +107,39 @@
  self->recordState.dataFormat.mReserved = 0;
  self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;

- int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
- int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
- self->recordState.maxAudioSec = maxAudioSec;
-
- int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
- int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-
- self->recordState.audioOutputPath = options[@"audioOutputPath"];
-
- self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
- self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
- if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
-
- self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
- self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
-
- self->recordState.audioSliceSec = audioSliceSec;
- self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+ self->recordState.isRealtime = true;
+ self->recordState.isTranscribing = false;
+ self->recordState.isCapturing = false;
+ self->recordState.isStoppedByAction = false;

  self->recordState.sliceIndex = 0;
  self->recordState.transcribeSliceIndex = 0;
  self->recordState.nSamplesTranscribing = 0;

- [self freeBufferIfNeeded];
- self->recordState.shortBufferSlices = [NSMutableArray new];
-
- int16_t *audioBufferI16 = (int16_t *) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
- [self->recordState.shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-
- self->recordState.sliceNSamples = [NSMutableArray new];
- [self->recordState.sliceNSamples addObject:[NSNumber numberWithInt:0]];
-
- self->recordState.isRealtime = true;
- self->recordState.isTranscribing = false;
- self->recordState.isCapturing = false;
- self->recordState.isStoppedByAction = false;
+ self->recordState.sliceNSamples.push_back(0);
+
+ self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+ self->recordState.job->set_realtime_params(
+ {
+ .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+ .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+ .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+ .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+ },
+ options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+ options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+ options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
+ options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+ );
+ self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;

  self->recordState.mSelf = self;
  }

- - (void)freeBufferIfNeeded {
- if (self->recordState.shortBufferSlices != nil) {
- for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
- int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
- free(audioBufferI16);
- }
- self->recordState.shortBufferSlices = nil;
- }
- }
-
- bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+ bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
  {
- bool isSpeech = true;
- if (!state->isTranscribing && state->useVad) {
- int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
- if (nSamples + n > sampleSize) {
- int start = nSamples + n - sampleSize;
- std::vector<float> audioBufferF32Vec(sampleSize);
- for (int i = 0; i < sampleSize; i++) {
- audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
- }
- isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
- NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
- } else {
- isSpeech = false;
- }
- }
- return isSpeech;
+ if (state->isTranscribing) return true;
+ return state->job->vad_simple(sliceIndex, nSamples, n);
  }

  void AudioInputCallback(void * inUserData,
@@ -196,15 +160,15 @@ void AudioInputCallback(void * inUserData,
  }

  int totalNSamples = 0;
- for (int i = 0; i < [state->sliceNSamples count]; i++) {
- totalNSamples += [[state->sliceNSamples objectAtIndex:i] intValue];
+ for (int i = 0; i < state->sliceNSamples.size(); i++) {
+ totalNSamples += state->sliceNSamples[i];
  }

  const int n = inBuffer->mAudioDataByteSize / 2;

- int nSamples = [state->sliceNSamples[state->sliceIndex] intValue];
+ int nSamples = state->sliceNSamples[state->sliceIndex];

- if (totalNSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
+ if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
  NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
  state->isCapturing = false;
  [state->mSelf stopAudio];
@@ -218,8 +182,8 @@
  !state->isTranscribing &&
  nSamples != state->nSamplesTranscribing
  ) {
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
- if (!vad(state, audioBufferI16, nSamples, 0)) {
+ bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+ if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
  [state->mSelf finishRealtimeTranscribe:state result:@{}];
  return;
  }
@@ -231,31 +195,25 @@
  return;
  }

- int audioSliceSec = state->audioSliceSec;
- if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+ if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
  // next slice
  state->sliceIndex++;
  nSamples = 0;
- int16_t* audioBufferI16 = (int16_t*) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
- [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
- [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+ state->sliceNSamples.push_back(0);
  }

- // Append to buffer
  NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);

- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
- for (int i = 0; i < n; i++) {
- audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
- }
+ state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);

- bool isSpeech = vad(state, audioBufferI16, nSamples, n);
+ bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
  nSamples += n;
- state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
+ state->sliceNSamples[state->sliceIndex] = nSamples;

  AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);

- if (!isSpeech) return;
+ bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
+ if (!isSamplesEnough || !isSpeech) return;

  if (!state->isTranscribing) {
  state->isTranscribing = true;
@@ -267,32 +225,27 @@ void AudioInputCallback(void * inUserData,

  - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
  // Save wav if needed
- if (state->audioOutputPath != nil) {
+ if (state->job->audio_output_path != nullptr) {
  // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
- [RNWhisperAudioUtils
- saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
- sliceNSamples:state->sliceNSamples]
- audioOutputFile:state->audioOutputPath
- ];
+ rnaudioutils::save_wav_file(
+ rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+ state->job->audio_output_path
+ );
  }
- state->transcribeHandler(state->jobId, @"end", result);
+ state->transcribeHandler(state->job->job_id, @"end", result);
+ rnwhisper::job_remove(state->job->job_id);
  }

  - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
- int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+ int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
  state->nSamplesTranscribing = nSamplesOfIndex;
  NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);

- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->transcribeSliceIndex] pointerValue];
- float* audioBufferF32 = (float*) malloc(state->nSamplesTranscribing * sizeof(float));
- // convert I16 to F32
- for (int i = 0; i < state->nSamplesTranscribing; i++) {
- audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
- }
+ float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
+
  CFTimeInterval timeStart = CACurrentMediaTime();
- struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
- int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
- free(audioBufferF32);
+ int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+ free(pcmf32);
  CFTimeInterval timeEnd = CACurrentMediaTime();
  const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;

@@ -312,7 +265,7 @@ void AudioInputCallback(void * inUserData,
  result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
  }

- nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
+ nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];

  bool isStopped = state->isStoppedByAction || (
  !state->isCapturing &&
@@ -340,10 +293,10 @@ void AudioInputCallback(void * inUserData,
  [state->mSelf finishRealtimeTranscribe:state result:result];
  } else if (code == 0) {
  result[@"isCapturing"] = @(true);
- state->transcribeHandler(state->jobId, @"transcribe", result);
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
  } else {
  result[@"isCapturing"] = @(true);
- state->transcribeHandler(state->jobId, @"transcribe", result);
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
  }

  if (continueNeeded) {
@@ -371,8 +324,7 @@ void AudioInputCallback(void * inUserData,
  onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
  {
  self->recordState.transcribeHandler = onTranscribe;
- self->recordState.jobId = jobId;
- [self prepareRealtime:options];
+ [self prepareRealtime:jobId options:options];

  OSStatus status = AudioQueueNewInput(
  &self->recordState.dataFormat,
@@ -413,9 +365,9 @@ struct rnwhisper_segments_callback_data {
  dispatch_async(dQueue, ^{
  self->recordState.isStoppedByAction = false;
  self->recordState.isTranscribing = true;
- self->recordState.jobId = jobId;

- whisper_full_params params = [self getParams:options jobId:jobId];
+ whisper_full_params params = [self createParams:options jobId:jobId];
+
  if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
  params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
  void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -460,8 +412,10 @@ struct rnwhisper_segments_callback_data {
  };
  params.new_segment_callback_user_data = &user_data;
  }
- int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
- self->recordState.jobId = -1;
+
+ rnwhisper::job* job = rnwhisper::job_new(jobId, params);;
+ int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+ rnwhisper::job_remove(jobId);
  self->recordState.isTranscribing = false;
  onEnd(code);
  });
@@ -476,7 +430,7 @@ struct rnwhisper_segments_callback_data {
  }

  - (void)stopTranscribe:(int)jobId {
- rn_whisper_abort_transcribe(jobId);
+ if (self->recordState.job) self->recordState.job->abort();
  if (self->recordState.isRealtime && self->recordState.isCapturing) {
  [self stopAudio];
  if (!self->recordState.isTranscribing) {
@@ -490,13 +444,11 @@ struct rnwhisper_segments_callback_data {
  }

  - (void)stopCurrentTranscribe {
- if (!self->recordState.jobId) {
- return;
- }
- [self stopTranscribe:self->recordState.jobId];
+ if (self->recordState.job == nullptr) return;
+ [self stopTranscribe:self->recordState.job->job_id];
  }

- - (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId {
+ - (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

  const int n_threads = options[@"maxThreads"] != nil ?
@@ -517,7 +469,7 @@ struct rnwhisper_segments_callback_data {
  params.print_special = false;
  params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
  params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
- params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+ params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
  params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
  params.offset_ms = 0;
  params.no_context = true;
@@ -534,7 +486,6 @@
  if (options[@"maxContext"] != nil) {
  params.n_max_text_ctx = [options[@"maxContext"] intValue];
  }
-
  if (options[@"offset"] != nil) {
  params.offset_ms = [options[@"offset"] intValue];
  }
@@ -550,39 +501,20 @@
  if (options[@"temperatureInc"] != nil) {
  params.temperature_inc = [options[@"temperature_inc"] floatValue];
  }
-
  if (options[@"prompt"] != nil) {
- params.initial_prompt = [options[@"prompt"] UTF8String];
+ params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
  }

- // abort handler
- bool *abort_ptr = rn_whisper_assign_abort_map(jobId);
- params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
- bool is_aborted = *(bool*)user_data;
- return !is_aborted;
- };
- params.encoder_begin_callback_user_data = abort_ptr;
- params.abort_callback = [](void * user_data) {
- bool is_aborted = *(bool*)user_data;
- return is_aborted;
- };
- params.abort_callback_user_data = abort_ptr;
-
  return params;
  }

- - (int)fullTranscribe:(int)jobId
- params:(struct whisper_full_params)params
+ - (int)fullTranscribe:(rnwhisper::job *)job
  audioData:(float *)audioData
  audioDataCount:(int)audioDataCount
  {
  whisper_reset_timings(self->ctx);
-
- int code = whisper_full(self->ctx, params, audioData, audioDataCount);
- if (rn_whisper_transcribe_is_aborted(jobId)) {
- code = -999;
- }
- rn_whisper_remove_abort_map(jobId);
+ int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+ if (job && job->is_aborted()) code = -999;
  // if (code == 0) {
  // whisper_print_timings(self->ctx);
  // }
@@ -616,7 +548,6 @@ struct rnwhisper_segments_callback_data {
  - (void)invalidate {
  [self stopCurrentTranscribe];
  whisper_free(self->ctx);
- [self freeBufferIfNeeded];
  }

  @end
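
Behavior note on the realtime path above: a transcription pass now starts only after both a minimum amount of audio has accumulated (the new realtimeAudioMinSec option, surfaced as audio_min_sec on the job) and the VAD check passes. Reduced to a sketch that mirrors the expressions in AudioInputCallback (including the integer division):

    #include "whisper.h" // WHISPER_SAMPLE_RATE

    // Gate applied before kicking off fullTranscribeSamples.
    static bool should_transcribe(int n_samples, float audio_min_sec, bool is_speech) {
        bool enough = n_samples / WHISPER_SAMPLE_RATE >= audio_min_sec; // integer division, as in the callback
        return enough && is_speech; // job->vad_simple() still has the final say
    }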