whisper.rn 0.4.0-rc.4 → 0.4.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +5 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
  5. package/android/src/main/java/com/rnwhisper/WhisperContext.java +51 -133
  6. package/android/src/main/jni-utils.h +76 -0
  7. package/android/src/main/jni.cpp +187 -112
  8. package/cpp/README.md +1 -1
  9. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  10. package/cpp/coreml/whisper-encoder.h +4 -0
  11. package/cpp/coreml/whisper-encoder.mm +4 -2
  12. package/cpp/ggml-alloc.c +55 -19
  13. package/cpp/ggml-alloc.h +7 -0
  14. package/cpp/ggml-backend-impl.h +46 -21
  15. package/cpp/ggml-backend.c +563 -156
  16. package/cpp/ggml-backend.h +62 -17
  17. package/cpp/ggml-impl.h +1 -1
  18. package/cpp/ggml-metal-whisper.metal +1010 -253
  19. package/cpp/ggml-metal.h +7 -1
  20. package/cpp/ggml-metal.m +618 -187
  21. package/cpp/ggml-quants.c +64 -59
  22. package/cpp/ggml-quants.h +40 -40
  23. package/cpp/ggml.c +751 -1466
  24. package/cpp/ggml.h +90 -25
  25. package/cpp/rn-audioutils.cpp +68 -0
  26. package/cpp/rn-audioutils.h +14 -0
  27. package/cpp/rn-whisper-log.h +11 -0
  28. package/cpp/rn-whisper.cpp +141 -59
  29. package/cpp/rn-whisper.h +47 -15
  30. package/cpp/whisper.cpp +1635 -928
  31. package/cpp/whisper.h +55 -10
  32. package/ios/RNWhisper.mm +7 -7
  33. package/ios/RNWhisperAudioUtils.h +0 -2
  34. package/ios/RNWhisperAudioUtils.m +0 -56
  35. package/ios/RNWhisperContext.h +3 -11
  36. package/ios/RNWhisperContext.mm +62 -134
  37. package/lib/commonjs/version.json +1 -1
  38. package/lib/module/version.json +1 -1
  39. package/package.json +6 -5
  40. package/src/version.json +1 -1
package/cpp/whisper.h CHANGED
@@ -1,6 +1,8 @@
1
1
  #ifndef WHISPER_H
2
2
  #define WHISPER_H
3
3
 
4
+ #include "ggml.h"
5
+
4
6
  #include <stddef.h>
5
7
  #include <stdint.h>
6
8
  #include <stdbool.h>
@@ -48,7 +50,9 @@ extern "C" {
48
50
  //
49
51
  // ...
50
52
  //
51
- // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
53
+ // whisper_context_params cparams = whisper_context_default_params();
54
+ //
55
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
52
56
  //
53
57
  // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
54
58
  // fprintf(stderr, "failed to process audio\n");
@@ -76,7 +80,9 @@ extern "C" {
76
80
  struct whisper_state;
77
81
  struct whisper_full_params;
78
82
 
79
- typedef int whisper_token;
83
+ typedef int32_t whisper_pos;
84
+ typedef int32_t whisper_token;
85
+ typedef int32_t whisper_seq_id;
80
86
 
81
87
  struct whisper_context_params {
82
88
  bool use_gpu;
@@ -108,18 +114,49 @@ extern "C" {
108
114
  void (*close)(void * ctx);
109
115
  } whisper_model_loader;
110
116
 
117
+ // grammar element type
118
+ enum whisper_gretype {
119
+ // end of rule definition
120
+ WHISPER_GRETYPE_END = 0,
121
+
122
+ // start of alternate definition for rule
123
+ WHISPER_GRETYPE_ALT = 1,
124
+
125
+ // non-terminal element: reference to rule
126
+ WHISPER_GRETYPE_RULE_REF = 2,
127
+
128
+ // terminal element: character (code point)
129
+ WHISPER_GRETYPE_CHAR = 3,
130
+
131
+ // inverse char(s) ([^a], [^a-b], [^abc])
132
+ WHISPER_GRETYPE_CHAR_NOT = 4,
133
+
134
+ // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
135
+ // be an inclusive range ([a-z])
136
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
137
+
138
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
139
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
140
+ WHISPER_GRETYPE_CHAR_ALT = 6,
141
+ };
142
+
143
+ typedef struct whisper_grammar_element {
144
+ enum whisper_gretype type;
145
+ uint32_t value; // Unicode code point or rule ID
146
+ } whisper_grammar_element;
147
+
111
148
  // Various functions for loading a ggml whisper model.
112
149
  // Allocate (almost) all memory needed for the model.
113
150
  // Return NULL on failure
114
- WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params);
115
- WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
116
- WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
151
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
152
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
153
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
117
154
 
118
155
  // These are the same as the above, but the internal state of the context is not allocated automatically
119
156
  // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
120
- WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params);
121
- WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
122
- WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
157
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
158
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
159
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
123
160
 
124
161
  WHISPER_DEPRECATED(
125
162
  WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
@@ -279,6 +316,9 @@ extern "C" {
279
316
  // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
280
317
  WHISPER_API const char * whisper_lang_str(int id);
281
318
 
319
+ // Return the full language name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
320
+ WHISPER_API const char * whisper_lang_str_full(int id);
321
+
282
322
  // Use mel data at offset_ms to try and auto-detect the spoken language
283
323
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
284
324
  // Returns the top language id or negative on failure
@@ -401,6 +441,7 @@ extern "C" {
401
441
 
402
442
  bool translate;
403
443
  bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
444
+ bool no_timestamps; // do not generate timestamps
404
445
  bool single_segment; // force single segment output (useful for streaming)
405
446
  bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
406
447
  bool print_progress; // print progress information
@@ -478,6 +519,11 @@ extern "C" {
478
519
  // called by each decoder to filter obtained logits
479
520
  whisper_logits_filter_callback logits_filter_callback;
480
521
  void * logits_filter_callback_user_data;
522
+
523
+ const whisper_grammar_element ** grammar_rules;
524
+ size_t n_grammar_rules;
525
+ size_t i_start_rule;
526
+ float grammar_penalty;
481
527
  };
482
528
 
483
529
  // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
@@ -571,8 +617,7 @@ extern "C" {
571
617
 
572
618
  // Control logging output; default behavior is to print to stderr
573
619
 
574
- typedef void (*whisper_log_callback)(const char * line);
575
- WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
620
+ WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
576
621
 
577
622
  #ifdef __cplusplus
578
623
  }
package/ios/RNWhisper.mm CHANGED
@@ -142,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
142
142
  audioDataCount:count
143
143
  options:options
144
144
  onProgress: ^(int progress) {
145
- if (rn_whisper_transcribe_is_aborted(jobId)) {
146
- return;
147
- }
145
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
146
+ if (job && job->is_aborted()) return;
147
+
148
148
  dispatch_async(dispatch_get_main_queue(), ^{
149
149
  [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
150
150
  body:@{
@@ -156,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
156
156
  });
157
157
  }
158
158
  onNewSegments: ^(NSDictionary *result) {
159
- if (rn_whisper_transcribe_is_aborted(jobId)) {
160
- return;
161
- }
159
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
160
+ if (job && job->is_aborted()) return;
161
+
162
162
  dispatch_async(dispatch_get_main_queue(), ^{
163
163
  [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
164
164
  body:@{
@@ -279,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
279
279
  [context invalidate];
280
280
  }
281
281
 
282
- rn_whisper_abort_all_transcribe(); // graceful abort
282
+ rnwhisper::job_abort_all(); // graceful abort
283
283
 
284
284
  [contexts removeAllObjects];
285
285
  contexts = nil;
@@ -2,8 +2,6 @@
2
2
 
3
3
  @interface RNWhisperAudioUtils : NSObject
4
4
 
5
- + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
6
- + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
7
5
  + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
8
6
 
9
7
  @end
@@ -3,62 +3,6 @@
3
3
 
4
4
  @implementation RNWhisperAudioUtils
5
5
 
6
- + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
7
- NSMutableData *outputData = [NSMutableData data];
8
- for (int i = 0; i < buffers.count; i++) {
9
- int size = [sliceNSamples objectAtIndex:i].intValue;
10
- NSValue *buffer = [buffers objectAtIndex:i];
11
- short *bufferPtr = buffer.pointerValue;
12
- [outputData appendBytes:bufferPtr length:size * sizeof(short)];
13
- }
14
- return outputData;
15
- }
16
-
17
- + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
18
- NSMutableData *outputData = [NSMutableData data];
19
-
20
- // WAVE header
21
- [outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
22
- int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
23
- [outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
24
- [outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
25
- [outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
26
-
27
- int subchunk1Size = CFSwapInt32HostToLittle(16);
28
- [outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
29
-
30
- short audioFormat = CFSwapInt16HostToLittle(1); // PCM
31
- [outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
32
-
33
- short numChannels = CFSwapInt16HostToLittle(1); // mono
34
- [outputData appendBytes:&numChannels length:sizeof(numChannels)];
35
-
36
- int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
37
- [outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
38
-
39
- // (bitDepth * sampleRate * channels) >> 3
40
- int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
41
- [outputData appendBytes:&byteRate length:sizeof(byteRate)];
42
-
43
- // (bitDepth * channels) >> 3
44
- short blockAlign = CFSwapInt16HostToLittle(16 / 8);
45
- [outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
46
-
47
- // bitDepth
48
- short bitsPerSample = CFSwapInt16HostToLittle(16);
49
- [outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
50
-
51
- [outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
52
- int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
53
- [outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
54
-
55
- // Audio data
56
- [outputData appendData:rawData];
57
-
58
- // Save to file
59
- [outputData writeToFile:audioOutputFile atomically:YES];
60
- }
61
-
62
6
  + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
63
7
  NSURL *url = [NSURL fileURLWithPath:filePath];
64
8
  NSData *fileData = [NSData dataWithContentsOfURL:url];
@@ -11,29 +11,21 @@
11
11
 
12
12
  typedef struct {
13
13
  __unsafe_unretained id mSelf;
14
-
15
- int jobId;
16
14
  NSDictionary* options;
17
15
 
16
+ struct rnwhisper::job * job;
17
+
18
18
  bool isTranscribing;
19
19
  bool isRealtime;
20
20
  bool isCapturing;
21
21
  bool isStoppedByAction;
22
- int maxAudioSec;
23
22
  int nSamplesTranscribing;
24
- NSMutableArray<NSValue *> *shortBufferSlices;
25
- NSMutableArray<NSNumber *> *sliceNSamples;
23
+ std::vector<int> sliceNSamples;
26
24
  bool isUseSlices;
27
25
  int sliceIndex;
28
26
  int transcribeSliceIndex;
29
- int audioSliceSec;
30
27
  NSString* audioOutputPath;
31
28
 
32
- bool useVad;
33
- int vadMs;
34
- float vadThold;
35
- float vadFreqThold;
36
-
37
29
  AudioQueueRef queue;
38
30
  AudioStreamBasicDescription dataFormat;
39
31
  AudioQueueBufferRef buffers[NUM_BUFFERS];
@@ -1,5 +1,4 @@
1
1
  #import "RNWhisperContext.h"
2
- #import "RNWhisperAudioUtils.h"
3
2
  #import <Metal/Metal.h>
4
3
  #include <vector>
5
4
 
@@ -95,7 +94,7 @@
95
94
  return self->dQueue;
96
95
  }
97
96
 
98
- - (void)prepareRealtime:(NSDictionary *)options {
97
+ - (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
99
98
  self->recordState.options = options;
100
99
 
101
100
  self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
@@ -108,74 +107,38 @@
108
107
  self->recordState.dataFormat.mReserved = 0;
109
108
  self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
110
109
 
111
- int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
112
- int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
113
- self->recordState.maxAudioSec = maxAudioSec;
114
-
115
- int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
116
- int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
117
-
118
- self->recordState.audioOutputPath = options[@"audioOutputPath"];
119
-
120
- self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
121
- self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
122
- if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
123
-
124
- self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
125
- self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
126
-
127
- self->recordState.audioSliceSec = audioSliceSec;
128
- self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
110
+ self->recordState.isRealtime = true;
111
+ self->recordState.isTranscribing = false;
112
+ self->recordState.isCapturing = false;
113
+ self->recordState.isStoppedByAction = false;
129
114
 
130
115
  self->recordState.sliceIndex = 0;
131
116
  self->recordState.transcribeSliceIndex = 0;
132
117
  self->recordState.nSamplesTranscribing = 0;
133
118
 
134
- [self freeBufferIfNeeded];
135
- self->recordState.shortBufferSlices = [NSMutableArray new];
136
-
137
- int16_t *audioBufferI16 = (int16_t *) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
138
- [self->recordState.shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
139
-
140
- self->recordState.sliceNSamples = [NSMutableArray new];
141
- [self->recordState.sliceNSamples addObject:[NSNumber numberWithInt:0]];
142
-
143
- self->recordState.isRealtime = true;
144
- self->recordState.isTranscribing = false;
145
- self->recordState.isCapturing = false;
146
- self->recordState.isStoppedByAction = false;
119
+ self->recordState.sliceNSamples.push_back(0);
120
+
121
+ self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
122
+ self->recordState.job->set_realtime_params(
123
+ {
124
+ .use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
125
+ .vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
126
+ .vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
127
+ .freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
128
+ },
129
+ options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
130
+ options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
131
+ options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
132
+ );
133
+ self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
147
134
 
148
135
  self->recordState.mSelf = self;
149
136
  }
150
137
 
151
- - (void)freeBufferIfNeeded {
152
- if (self->recordState.shortBufferSlices != nil) {
153
- for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
154
- int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
155
- free(audioBufferI16);
156
- }
157
- self->recordState.shortBufferSlices = nil;
158
- }
159
- }
160
-
161
- bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
138
+ bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
162
139
  {
163
- bool isSpeech = true;
164
- if (!state->isTranscribing && state->useVad) {
165
- int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
166
- if (nSamples + n > sampleSize) {
167
- int start = nSamples + n - sampleSize;
168
- std::vector<float> audioBufferF32Vec(sampleSize);
169
- for (int i = 0; i < sampleSize; i++) {
170
- audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
171
- }
172
- isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
173
- NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
174
- } else {
175
- isSpeech = false;
176
- }
177
- }
178
- return isSpeech;
140
+ if (state->isTranscribing) return true;
141
+ return state->job->vad_simple(sliceIndex, nSamples, n);
179
142
  }
180
143
 
181
144
  void AudioInputCallback(void * inUserData,
@@ -196,15 +159,15 @@ void AudioInputCallback(void * inUserData,
196
159
  }
197
160
 
198
161
  int totalNSamples = 0;
199
- for (int i = 0; i < [state->sliceNSamples count]; i++) {
200
- totalNSamples += [[state->sliceNSamples objectAtIndex:i] intValue];
162
+ for (int i = 0; i < state->sliceNSamples.size(); i++) {
163
+ totalNSamples += state->sliceNSamples[i];
201
164
  }
202
165
 
203
166
  const int n = inBuffer->mAudioDataByteSize / 2;
204
167
 
205
- int nSamples = [state->sliceNSamples[state->sliceIndex] intValue];
168
+ int nSamples = state->sliceNSamples[state->sliceIndex];
206
169
 
207
- if (totalNSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
170
+ if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
208
171
  NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
209
172
  state->isCapturing = false;
210
173
  [state->mSelf stopAudio];
@@ -218,8 +181,7 @@ void AudioInputCallback(void * inUserData,
218
181
  !state->isTranscribing &&
219
182
  nSamples != state->nSamplesTranscribing
220
183
  ) {
221
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
222
- if (!vad(state, audioBufferI16, nSamples, 0)) {
184
+ if (!vad(state, state->sliceIndex, nSamples, 0)) {
223
185
  [state->mSelf finishRealtimeTranscribe:state result:@{}];
224
186
  return;
225
187
  }
@@ -231,27 +193,20 @@ void AudioInputCallback(void * inUserData,
231
193
  return;
232
194
  }
233
195
 
234
- int audioSliceSec = state->audioSliceSec;
235
- if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
196
+ if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
236
197
  // next slice
237
198
  state->sliceIndex++;
238
199
  nSamples = 0;
239
- int16_t* audioBufferI16 = (int16_t*) malloc(audioSliceSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
240
- [state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
241
- [state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
200
+ state->sliceNSamples.push_back(0);
242
201
  }
243
202
 
244
- // Append to buffer
245
203
  NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
246
204
 
247
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->sliceIndex] pointerValue];
248
- for (int i = 0; i < n; i++) {
249
- audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
250
- }
205
+ state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
251
206
 
252
- bool isSpeech = vad(state, audioBufferI16, nSamples, n);
207
+ bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
253
208
  nSamples += n;
254
- state->sliceNSamples[state->sliceIndex] = [NSNumber numberWithInt:nSamples];
209
+ state->sliceNSamples[state->sliceIndex] = nSamples;
255
210
 
256
211
  AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
257
212
 
@@ -267,32 +222,27 @@ void AudioInputCallback(void * inUserData,
267
222
 
268
223
  - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
269
224
  // Save wav if needed
270
- if (state->audioOutputPath != nil) {
225
+ if (state->job->audio_output_path != nullptr) {
271
226
  // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
272
- [RNWhisperAudioUtils
273
- saveWavFile:[RNWhisperAudioUtils concatShortBuffers:state->shortBufferSlices
274
- sliceNSamples:state->sliceNSamples]
275
- audioOutputFile:state->audioOutputPath
276
- ];
227
+ rnaudioutils::save_wav_file(
228
+ rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
229
+ state->job->audio_output_path
230
+ );
277
231
  }
278
- state->transcribeHandler(state->jobId, @"end", result);
232
+ state->transcribeHandler(state->job->job_id, @"end", result);
233
+ rnwhisper::job_remove(state->job->job_id);
279
234
  }
280
235
 
281
236
  - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
282
- int nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
237
+ int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
283
238
  state->nSamplesTranscribing = nSamplesOfIndex;
284
239
  NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
285
240
 
286
- int16_t* audioBufferI16 = (int16_t*) [state->shortBufferSlices[state->transcribeSliceIndex] pointerValue];
287
- float* audioBufferF32 = (float*) malloc(state->nSamplesTranscribing * sizeof(float));
288
- // convert I16 to F32
289
- for (int i = 0; i < state->nSamplesTranscribing; i++) {
290
- audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
291
- }
241
+ float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
242
+
292
243
  CFTimeInterval timeStart = CACurrentMediaTime();
293
- struct whisper_full_params params = [state->mSelf getParams:state->options jobId:state->jobId];
294
- int code = [state->mSelf fullTranscribe:state->jobId params:params audioData:audioBufferF32 audioDataCount:state->nSamplesTranscribing];
295
- free(audioBufferF32);
244
+ int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
245
+ free(pcmf32);
296
246
  CFTimeInterval timeEnd = CACurrentMediaTime();
297
247
  const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
298
248
 
@@ -312,7 +262,7 @@ void AudioInputCallback(void * inUserData,
312
262
  result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
313
263
  }
314
264
 
315
- nSamplesOfIndex = [[state->sliceNSamples objectAtIndex:state->transcribeSliceIndex] intValue];
265
+ nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
316
266
 
317
267
  bool isStopped = state->isStoppedByAction || (
318
268
  !state->isCapturing &&
@@ -340,10 +290,10 @@ void AudioInputCallback(void * inUserData,
340
290
  [state->mSelf finishRealtimeTranscribe:state result:result];
341
291
  } else if (code == 0) {
342
292
  result[@"isCapturing"] = @(true);
343
- state->transcribeHandler(state->jobId, @"transcribe", result);
293
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
344
294
  } else {
345
295
  result[@"isCapturing"] = @(true);
346
- state->transcribeHandler(state->jobId, @"transcribe", result);
296
+ state->transcribeHandler(state->job->job_id, @"transcribe", result);
347
297
  }
348
298
 
349
299
  if (continueNeeded) {
@@ -371,8 +321,7 @@ void AudioInputCallback(void * inUserData,
371
321
  onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
372
322
  {
373
323
  self->recordState.transcribeHandler = onTranscribe;
374
- self->recordState.jobId = jobId;
375
- [self prepareRealtime:options];
324
+ [self prepareRealtime:jobId options:options];
376
325
 
377
326
  OSStatus status = AudioQueueNewInput(
378
327
  &self->recordState.dataFormat,
@@ -413,9 +362,9 @@ struct rnwhisper_segments_callback_data {
413
362
  dispatch_async(dQueue, ^{
414
363
  self->recordState.isStoppedByAction = false;
415
364
  self->recordState.isTranscribing = true;
416
- self->recordState.jobId = jobId;
417
365
 
418
- whisper_full_params params = [self getParams:options jobId:jobId];
366
+ whisper_full_params params = [self createParams:options jobId:jobId];
367
+
419
368
  if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
420
369
  params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
421
370
  void (^onProgress)(int) = (__bridge void (^)(int))user_data;
@@ -460,8 +409,10 @@ struct rnwhisper_segments_callback_data {
460
409
  };
461
410
  params.new_segment_callback_user_data = &user_data;
462
411
  }
463
- int code = [self fullTranscribe:jobId params:params audioData:audioData audioDataCount:audioDataCount];
464
- self->recordState.jobId = -1;
412
+
413
+ rnwhisper::job* job = rnwhisper::job_new(jobId, params);;
414
+ int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
415
+ rnwhisper::job_remove(jobId);
465
416
  self->recordState.isTranscribing = false;
466
417
  onEnd(code);
467
418
  });
@@ -476,7 +427,7 @@ struct rnwhisper_segments_callback_data {
476
427
  }
477
428
 
478
429
  - (void)stopTranscribe:(int)jobId {
479
- rn_whisper_abort_transcribe(jobId);
430
+ if (self->recordState.job) self->recordState.job->abort();
480
431
  if (self->recordState.isRealtime && self->recordState.isCapturing) {
481
432
  [self stopAudio];
482
433
  if (!self->recordState.isTranscribing) {
@@ -490,13 +441,11 @@ struct rnwhisper_segments_callback_data {
490
441
  }
491
442
 
492
443
  - (void)stopCurrentTranscribe {
493
- if (!self->recordState.jobId) {
494
- return;
495
- }
496
- [self stopTranscribe:self->recordState.jobId];
444
+ if (self->recordState.job == nullptr) return;
445
+ [self stopTranscribe:self->recordState.job->job_id];
497
446
  }
498
447
 
499
- - (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId {
448
+ - (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
500
449
  struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
501
450
 
502
451
  const int n_threads = options[@"maxThreads"] != nil ?
@@ -534,7 +483,6 @@ struct rnwhisper_segments_callback_data {
534
483
  if (options[@"maxContext"] != nil) {
535
484
  params.n_max_text_ctx = [options[@"maxContext"] intValue];
536
485
  }
537
-
538
486
  if (options[@"offset"] != nil) {
539
487
  params.offset_ms = [options[@"offset"] intValue];
540
488
  }
@@ -550,39 +498,20 @@ struct rnwhisper_segments_callback_data {
550
498
  if (options[@"temperatureInc"] != nil) {
551
499
  params.temperature_inc = [options[@"temperature_inc"] floatValue];
552
500
  }
553
-
554
501
  if (options[@"prompt"] != nil) {
555
502
  params.initial_prompt = [options[@"prompt"] UTF8String];
556
503
  }
557
504
 
558
- // abort handler
559
- bool *abort_ptr = rn_whisper_assign_abort_map(jobId);
560
- params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
561
- bool is_aborted = *(bool*)user_data;
562
- return !is_aborted;
563
- };
564
- params.encoder_begin_callback_user_data = abort_ptr;
565
- params.abort_callback = [](void * user_data) {
566
- bool is_aborted = *(bool*)user_data;
567
- return is_aborted;
568
- };
569
- params.abort_callback_user_data = abort_ptr;
570
-
571
505
  return params;
572
506
  }
573
507
 
574
- - (int)fullTranscribe:(int)jobId
575
- params:(struct whisper_full_params)params
508
+ - (int)fullTranscribe:(rnwhisper::job *)job
576
509
  audioData:(float *)audioData
577
510
  audioDataCount:(int)audioDataCount
578
511
  {
579
512
  whisper_reset_timings(self->ctx);
580
-
581
- int code = whisper_full(self->ctx, params, audioData, audioDataCount);
582
- if (rn_whisper_transcribe_is_aborted(jobId)) {
583
- code = -999;
584
- }
585
- rn_whisper_remove_abort_map(jobId);
513
+ int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
514
+ if (job && job->is_aborted()) code = -999;
586
515
  // if (code == 0) {
587
516
  // whisper_print_timings(self->ctx);
588
517
  // }
@@ -616,7 +545,6 @@ struct rnwhisper_segments_callback_data {
616
545
  - (void)invalidate {
617
546
  [self stopCurrentTranscribe];
618
547
  whisper_free(self->ctx);
619
- [self freeBufferIfNeeded];
620
548
  }
621
549
 
622
550
  @end
@@ -1 +1 @@
1
- {"version":"1.4.3"}
1
+ {"version":"1.5.1"}
@@ -1 +1 @@
1
- {"version":"1.4.3"}
1
+ {"version":"1.5.1"}