whisper.rn 0.4.0-rc.4 → 0.4.0-rc.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +51 -133
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +187 -112
- package/cpp/README.md +1 -1
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +4 -2
- package/cpp/ggml-alloc.c +55 -19
- package/cpp/ggml-alloc.h +7 -0
- package/cpp/ggml-backend-impl.h +46 -21
- package/cpp/ggml-backend.c +563 -156
- package/cpp/ggml-backend.h +62 -17
- package/cpp/ggml-impl.h +1 -1
- package/cpp/ggml-metal-whisper.metal +1010 -253
- package/cpp/ggml-metal.h +7 -1
- package/cpp/ggml-metal.m +618 -187
- package/cpp/ggml-quants.c +64 -59
- package/cpp/ggml-quants.h +40 -40
- package/cpp/ggml.c +751 -1466
- package/cpp/ggml.h +90 -25
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +141 -59
- package/cpp/rn-whisper.h +47 -15
- package/cpp/whisper.cpp +1635 -928
- package/cpp/whisper.h +55 -10
- package/ios/RNWhisper.mm +7 -7
- package/ios/RNWhisperAudioUtils.h +0 -2
- package/ios/RNWhisperAudioUtils.m +0 -56
- package/ios/RNWhisperContext.h +3 -11
- package/ios/RNWhisperContext.mm +62 -134
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +6 -5
- package/src/version.json +1 -1
package/cpp/whisper.h
CHANGED

@@ -1,6 +1,8 @@
 #ifndef WHISPER_H
 #define WHISPER_H
 
+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>

@@ -48,7 +50,9 @@ extern "C" {
 //
 // ...
 //
-//
+// whisper_context_params cparams = whisper_context_default_params();
+//
+// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
 //
 // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
 // fprintf(stderr, "failed to process audio\n");

@@ -76,7 +80,9 @@ extern "C" {
 struct whisper_state;
 struct whisper_full_params;
 
-typedef
+typedef int32_t whisper_pos;
+typedef int32_t whisper_token;
+typedef int32_t whisper_seq_id;
 
 struct whisper_context_params {
 bool use_gpu;

@@ -108,18 +114,49 @@ extern "C" {
 void (*close)(void * ctx);
 } whisper_model_loader;
 
+// grammar element type
+enum whisper_gretype {
+// end of rule definition
+WHISPER_GRETYPE_END = 0,
+
+// start of alternate definition for rule
+WHISPER_GRETYPE_ALT = 1,
+
+// non-terminal element: reference to rule
+WHISPER_GRETYPE_RULE_REF = 2,
+
+// terminal element: character (code point)
+WHISPER_GRETYPE_CHAR = 3,
+
+// inverse char(s) ([^a], [^a-b] [^abc])
+WHISPER_GRETYPE_CHAR_NOT = 4,
+
+// modifies a preceding WHISPER_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+// be an inclusive range ([a-z])
+WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
+
+// modifies a preceding WHISPER_GRETYPE_CHAR or
+// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+WHISPER_GRETYPE_CHAR_ALT = 6,
+};
+
+typedef struct whisper_grammar_element {
+enum whisper_gretype type;
+uint32_t value; // Unicode code point or rule ID
+} whisper_grammar_element;
+
 // Various functions for loading a ggml whisper model.
 // Allocate (almost) all memory needed for the model.
 // Return NULL on failure
-WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model,
-WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,
-WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
 
 // These are the same as the above, but the internal state of the context is not allocated automatically
 // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model,
-WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,
-WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
+WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
 
 WHISPER_DEPRECATED(
 WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
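The initializer changes above replace the old single-argument loaders: every `*_with_params` variant now takes a `whisper_context_params` value, exactly as the updated header comment shows. A minimal sketch of the new call sequence (the model path is a placeholder):

```cpp
#include "whisper.h"

int main(void) {
    // Context creation now goes through an explicit params struct.
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = true; // field shown in the whisper_context_params struct above

    struct whisper_context * ctx =
        whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    if (ctx == NULL) return 1; // the loaders return NULL on failure

    // ... run whisper_full(ctx, wparams, pcmf32, n_samples) as before ...

    whisper_free(ctx);
    return 0;
}
```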
@@ -279,6 +316,9 @@ extern "C" {
 // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
 WHISPER_API const char * whisper_lang_str(int id);
 
+// Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
+WHISPER_API const char * whisper_lang_str_full(int id);
+
 // Use mel data at offset_ms to try and auto-detect the spoken language
 // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
 // Returns the top language id or negative on failure

@@ -401,6 +441,7 @@ extern "C" {
 
 bool translate;
 bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
+bool no_timestamps; // do not generate timestamps
 bool single_segment; // force single segment output (useful for streaming)
 bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
 bool print_progress; // print progress information
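Two small additions above: a full language-name lookup and a `no_timestamps` flag on `whisper_full_params`. A short usage sketch:

```cpp
#include <stdio.h>
#include "whisper.h"

static void show_new_options(void) {
    // Language id 2 resolves to "de" (whisper_lang_str) and "german" (new whisper_lang_str_full).
    printf("short: %s, full: %s\n", whisper_lang_str(2), whisper_lang_str_full(2));

    // New flag: skip timestamp generation entirely, e.g. for text-only streaming output.
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.no_timestamps = true;
    (void) wparams; // pass to whisper_full() as usual
}
```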
@@ -478,6 +519,11 @@ extern "C" {
 // called by each decoder to filter obtained logits
 whisper_logits_filter_callback logits_filter_callback;
 void * logits_filter_callback_user_data;
+
+const whisper_grammar_element ** grammar_rules;
+size_t n_grammar_rules;
+size_t i_start_rule;
+float grammar_penalty;
 };
 
 // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
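These four fields connect the grammar types declared earlier in the header to decoding. A hedged sketch of wiring a single rule into `whisper_full_params`; the rule encoding (each rule is an element array terminated by `WHISPER_GRETYPE_END`, with `i_start_rule` selecting the root rule) and the penalty value are assumptions carried over from the llama.cpp-style grammar format, not something this diff spells out:

```cpp
#include "whisper.h"

// Hypothetical one-rule grammar that only accepts the literal character sequence "yes".
static const whisper_grammar_element k_rule_root[] = {
    { WHISPER_GRETYPE_CHAR, 'y' },
    { WHISPER_GRETYPE_CHAR, 'e' },
    { WHISPER_GRETYPE_CHAR, 's' },
    { WHISPER_GRETYPE_END,  0   },
};

static const whisper_grammar_element * k_rules[] = { k_rule_root };

static void apply_grammar(struct whisper_full_params * wparams) {
    wparams->grammar_rules   = k_rules;  // array of rules
    wparams->n_grammar_rules = 1;        // entries in k_rules
    wparams->i_start_rule    = 0;        // index of the root rule (assumption)
    wparams->grammar_penalty = 100.0f;   // penalty strength (assumed value)
}
```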
@@ -571,8 +617,7 @@ extern "C" {
 
 // Control logging output; default behavior is to print to stderr
 
-
-WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
 
 #ifdef __cplusplus
 }
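`whisper_set_log_callback` is gone; logging now goes through the ggml-style `whisper_log_set`, which is also why `ggml.h` is included at the top of the header. A sketch, assuming the fork's `wsp_ggml_log_callback` mirrors upstream ggml's `void (*)(enum log_level, const char * text, void * user_data)` shape:

```cpp
#include <stdio.h>
#include "whisper.h"

// Forward whisper/ggml log lines to stderr with a prefix.
// Callback/level types are assumed to follow upstream ggml, renamed with the wsp_ prefix.
static void rn_log_cb(enum wsp_ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fprintf(stderr, "[whisper.rn] %s", text);
}

static void install_logger(void) {
    whisper_log_set(rn_log_cb, NULL);
}
```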
package/ios/RNWhisper.mm
CHANGED

@@ -142,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
 audioDataCount:count
 options:options
 onProgress: ^(int progress) {
-
-
-
+rnwhisper::job* job = rnwhisper::job_get(jobId);
+if (job && job->is_aborted()) return;
+
 dispatch_async(dispatch_get_main_queue(), ^{
 [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
 body:@{

@@ -156,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
 });
 }
 onNewSegments: ^(NSDictionary *result) {
-
-
-
+rnwhisper::job* job = rnwhisper::job_get(jobId);
+if (job && job->is_aborted()) return;
+
 dispatch_async(dispatch_get_main_queue(), ^{
 [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
 body:@{

@@ -279,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
 [context invalidate];
 }
 
-
+rnwhisper::job_abort_all(); // graceful abort
 
 [contexts removeAllObjects];
 contexts = nil;
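The removed per-request abort checks are replaced here by a job registry in the C++ layer (`rnwhisper::job_*`, presumably declared in the reworked `cpp/rn-whisper.h` / `rn-whisper.cpp` listed above). The lookup pattern used by both event callbacks, reduced to a plain C++ sketch (only identifiers visible in this diff are used; the event emission is elided):

```cpp
#include "rn-whisper.h" // assumed header exposing the rnwhisper::job registry used above

// Progress / new-segment callbacks look the job up by id and bail out once it is aborted.
static void on_progress(int job_id, int progress) {
    rnwhisper::job * job = rnwhisper::job_get(job_id);
    if (job && job->is_aborted()) return;
    // ... emit @RNWhisper_onTranscribeProgress with `progress` ...
    (void) progress;
}

// releaseAllContexts now aborts every registered job before invalidating contexts.
static void release_all_jobs(void) {
    rnwhisper::job_abort_all(); // graceful abort
}
```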
package/ios/RNWhisperAudioUtils.h
CHANGED

@@ -2,8 +2,6 @@
 
 @interface RNWhisperAudioUtils : NSObject
 
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
 
 @end
package/ios/RNWhisperAudioUtils.m
CHANGED

@@ -3,62 +3,6 @@
 
 @implementation RNWhisperAudioUtils
 
-+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
-NSMutableData *outputData = [NSMutableData data];
-for (int i = 0; i < buffers.count; i++) {
-int size = [sliceNSamples objectAtIndex:i].intValue;
-NSValue *buffer = [buffers objectAtIndex:i];
-short *bufferPtr = buffer.pointerValue;
-[outputData appendBytes:bufferPtr length:size * sizeof(short)];
-}
-return outputData;
-}
-
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
-NSMutableData *outputData = [NSMutableData data];
-
-// WAVE header
-[outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
-int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
-[outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
-[outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
-[outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
-
-int subchunk1Size = CFSwapInt32HostToLittle(16);
-[outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
-
-short audioFormat = CFSwapInt16HostToLittle(1); // PCM
-[outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
-
-short numChannels = CFSwapInt16HostToLittle(1); // mono
-[outputData appendBytes:&numChannels length:sizeof(numChannels)];
-
-int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
-[outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
-
-// (bitDepth * sampleRate * channels) >> 3
-int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
-[outputData appendBytes:&byteRate length:sizeof(byteRate)];
-
-// (bitDepth * channels) >> 3
-short blockAlign = CFSwapInt16HostToLittle(16 / 8);
-[outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
-
-// bitDepth
-short bitsPerSample = CFSwapInt16HostToLittle(16);
-[outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
-
-[outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
-int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
-[outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
-
-// Audio data
-[outputData appendData:rawData];
-
-// Save to file
-[outputData writeToFile:audioOutputFile atomically:YES];
-}
-
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
 NSURL *url = [NSURL fileURLWithPath:filePath];
 NSData *fileData = [NSData dataWithContentsOfURL:url];
package/ios/RNWhisperContext.h
CHANGED

@@ -11,29 +11,21 @@
 
 typedef struct {
 __unsafe_unretained id mSelf;
-
-int jobId;
 NSDictionary* options;
 
+struct rnwhisper::job * job;
+
 bool isTranscribing;
 bool isRealtime;
 bool isCapturing;
 bool isStoppedByAction;
-int maxAudioSec;
 int nSamplesTranscribing;
-
-NSMutableArray<NSNumber *> *sliceNSamples;
+std::vector<int> sliceNSamples;
 bool isUseSlices;
 int sliceIndex;
 int transcribeSliceIndex;
-int audioSliceSec;
 NSString* audioOutputPath;
 
-bool useVad;
-int vadMs;
-float vadThold;
-float vadFreqThold;
-
 AudioQueueRef queue;
 AudioStreamBasicDescription dataFormat;
 AudioQueueBufferRef buffers[NUM_BUFFERS];
package/ios/RNWhisperContext.mm
CHANGED

@@ -1,5 +1,4 @@
 #import "RNWhisperContext.h"
-#import "RNWhisperAudioUtils.h"
 #import <Metal/Metal.h>
 #include <vector>
 

@@ -95,7 +94,7 @@
 return self->dQueue;
 }
 
-- (void)prepareRealtime:(NSDictionary *)options {
+- (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
 self->recordState.options = options;
 
 self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000

@@ -108,74 +107,38 @@
 self->recordState.dataFormat.mReserved = 0;
 self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
 
-
-
-self->recordState.
-
-int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
-int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
-
-self->recordState.audioOutputPath = options[@"audioOutputPath"];
-
-self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
-self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
-if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
-
-self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
-self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
-
-self->recordState.audioSliceSec = audioSliceSec;
-self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
+self->recordState.isRealtime = true;
+self->recordState.isTranscribing = false;
+self->recordState.isCapturing = false;
+self->recordState.isStoppedByAction = false;
 
 self->recordState.sliceIndex = 0;
 self->recordState.transcribeSliceIndex = 0;
 self->recordState.nSamplesTranscribing = 0;
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+self->recordState.sliceNSamples.push_back(0);
+
+self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
+self->recordState.job->set_realtime_params(
+{
+.use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
+.vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
+.vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
+.freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
+},
+options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
+options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
+options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
+);
+self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
 
 self->recordState.mSelf = self;
 }
 
-
-if (self->recordState.shortBufferSlices != nil) {
-for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
-int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
-free(audioBufferI16);
-}
-self->recordState.shortBufferSlices = nil;
-}
-}
-
-bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
+bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
 {
-
-
-int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
-if (nSamples + n > sampleSize) {
-int start = nSamples + n - sampleSize;
-std::vector<float> audioBufferF32Vec(sampleSize);
-for (int i = 0; i < sampleSize; i++) {
-audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
-}
-isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
-NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
-} else {
-isSpeech = false;
-}
-}
-return isSpeech;
+if (state->isTranscribing) return true;
+return state->job->vad_simple(sliceIndex, nSamples, n);
 }
 
 void AudioInputCallback(void * inUserData,
@@ -196,15 +159,15 @@ void AudioInputCallback(void * inUserData,
 }
 
 int totalNSamples = 0;
-for (int i = 0; i <
-totalNSamples +=
+for (int i = 0; i < state->sliceNSamples.size(); i++) {
+totalNSamples += state->sliceNSamples[i];
 }
 
 const int n = inBuffer->mAudioDataByteSize / 2;
 
-int nSamples =
+int nSamples = state->sliceNSamples[state->sliceIndex];
 
-if (totalNSamples + n > state->
+if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
 NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
 state->isCapturing = false;
 [state->mSelf stopAudio];

@@ -218,8 +181,7 @@ void AudioInputCallback(void * inUserData,
 !state->isTranscribing &&
 nSamples != state->nSamplesTranscribing
 ) {
-
-if (!vad(state, audioBufferI16, nSamples, 0)) {
+if (!vad(state, state->sliceIndex, nSamples, 0)) {
 [state->mSelf finishRealtimeTranscribe:state result:@{}];
 return;
 }

@@ -231,27 +193,20 @@ void AudioInputCallback(void * inUserData,
 return;
 }
 
-
-if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
+if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
 // next slice
 state->sliceIndex++;
 nSamples = 0;
-
-[state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
-[state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
+state->sliceNSamples.push_back(0);
 }
 
-// Append to buffer
 NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
 
-
-for (int i = 0; i < n; i++) {
-audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
-}
+state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
 
-bool isSpeech = vad(state,
+bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
 nSamples += n;
-state->sliceNSamples[state->sliceIndex] =
+state->sliceNSamples[state->sliceIndex] = nSamples;
 
 AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
 

@@ -267,32 +222,27 @@ void AudioInputCallback(void * inUserData,
 
 - (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
 // Save wav if needed
-if (state->
+if (state->job->audio_output_path != nullptr) {
 // TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
-
-
-
-
-];
+rnaudioutils::save_wav_file(
+rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
+state->job->audio_output_path
+);
 }
-state->transcribeHandler(state->
+state->transcribeHandler(state->job->job_id, @"end", result);
+rnwhisper::job_remove(state->job->job_id);
 }
 
 - (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
-int nSamplesOfIndex =
+int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
 state->nSamplesTranscribing = nSamplesOfIndex;
 NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
 
-
-
-// convert I16 to F32
-for (int i = 0; i < state->nSamplesTranscribing; i++) {
-audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
-}
+float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
+
 CFTimeInterval timeStart = CACurrentMediaTime();
-
-
-free(audioBufferF32);
+int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
+free(pcmf32);
 CFTimeInterval timeEnd = CACurrentMediaTime();
 const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
 

@@ -312,7 +262,7 @@ void AudioInputCallback(void * inUserData,
 result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
 }
 
-nSamplesOfIndex =
+nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
 
 bool isStopped = state->isStoppedByAction || (
 !state->isCapturing &&

@@ -340,10 +290,10 @@ void AudioInputCallback(void * inUserData,
 [state->mSelf finishRealtimeTranscribe:state result:result];
 } else if (code == 0) {
 result[@"isCapturing"] = @(true);
-state->transcribeHandler(state->
+state->transcribeHandler(state->job->job_id, @"transcribe", result);
 } else {
 result[@"isCapturing"] = @(true);
-state->transcribeHandler(state->
+state->transcribeHandler(state->job->job_id, @"transcribe", result);
 }
 
 if (continueNeeded) {

@@ -371,8 +321,7 @@ void AudioInputCallback(void * inUserData,
 onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
 {
 self->recordState.transcribeHandler = onTranscribe;
-self
-[self prepareRealtime:options];
+[self prepareRealtime:jobId options:options];
 
 OSStatus status = AudioQueueNewInput(
 &self->recordState.dataFormat,

@@ -413,9 +362,9 @@ struct rnwhisper_segments_callback_data {
 dispatch_async(dQueue, ^{
 self->recordState.isStoppedByAction = false;
 self->recordState.isTranscribing = true;
-self->recordState.jobId = jobId;
 
-whisper_full_params params = [self
+whisper_full_params params = [self createParams:options jobId:jobId];
+
 if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
 params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
 void (^onProgress)(int) = (__bridge void (^)(int))user_data;

@@ -460,8 +409,10 @@ struct rnwhisper_segments_callback_data {
 };
 params.new_segment_callback_user_data = &user_data;
 }
-
-
+
+rnwhisper::job* job = rnwhisper::job_new(jobId, params);;
+int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
+rnwhisper::job_remove(jobId);
 self->recordState.isTranscribing = false;
 onEnd(code);
 });

@@ -476,7 +427,7 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopTranscribe:(int)jobId {
-
+if (self->recordState.job) self->recordState.job->abort();
 if (self->recordState.isRealtime && self->recordState.isCapturing) {
 [self stopAudio];
 if (!self->recordState.isTranscribing) {

@@ -490,13 +441,11 @@ struct rnwhisper_segments_callback_data {
 }
 
 - (void)stopCurrentTranscribe {
-if (
-
-}
-[self stopTranscribe:self->recordState.jobId];
+if (self->recordState.job == nullptr) return;
+[self stopTranscribe:self->recordState.job->job_id];
 }
 
-- (struct whisper_full_params)
+- (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
 struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
 const int n_threads = options[@"maxThreads"] != nil ?

@@ -534,7 +483,6 @@ struct rnwhisper_segments_callback_data {
 if (options[@"maxContext"] != nil) {
 params.n_max_text_ctx = [options[@"maxContext"] intValue];
 }
-
 if (options[@"offset"] != nil) {
 params.offset_ms = [options[@"offset"] intValue];
 }

@@ -550,39 +498,20 @@ struct rnwhisper_segments_callback_data {
 if (options[@"temperatureInc"] != nil) {
 params.temperature_inc = [options[@"temperature_inc"] floatValue];
 }
-
 if (options[@"prompt"] != nil) {
 params.initial_prompt = [options[@"prompt"] UTF8String];
 }
 
-// abort handler
-bool *abort_ptr = rn_whisper_assign_abort_map(jobId);
-params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-bool is_aborted = *(bool*)user_data;
-return !is_aborted;
-};
-params.encoder_begin_callback_user_data = abort_ptr;
-params.abort_callback = [](void * user_data) {
-bool is_aborted = *(bool*)user_data;
-return is_aborted;
-};
-params.abort_callback_user_data = abort_ptr;
-
 return params;
 }
 
-- (int)fullTranscribe:(
-params:(struct whisper_full_params)params
+- (int)fullTranscribe:(rnwhisper::job *)job
 audioData:(float *)audioData
 audioDataCount:(int)audioDataCount
 {
 whisper_reset_timings(self->ctx);
-
-
-if (rn_whisper_transcribe_is_aborted(jobId)) {
-code = -999;
-}
-rn_whisper_remove_abort_map(jobId);
+int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
+if (job && job->is_aborted()) code = -999;
 // if (code == 0) {
 // whisper_print_timings(self->ctx);
 // }
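Taken together, these hunks swap the old abort-map plumbing for a job object that carries the `whisper_full_params` and the abort flag. The non-realtime lifecycle, condensed into a plain C++ sketch (only calls visible in this diff are used; audio loading and error handling are omitted):

```cpp
#include <vector>
#include "whisper.h"
#include "rn-whisper.h" // assumed header for the rnwhisper::job API

// One-shot transcription as wired up in RNWhisperContext.mm after this change.
static int transcribe_once(struct whisper_context * ctx, int job_id,
                           const std::vector<float> & pcmf32) {
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // Register the job; its params and abort state now live in the registry.
    rnwhisper::job * job = rnwhisper::job_new(job_id, params);

    int code = whisper_full(ctx, job->params, pcmf32.data(), (int) pcmf32.size());
    if (job->is_aborted()) code = -999; // same sentinel used above
    // A concurrent stopTranscribe: flips that flag via job->abort().

    rnwhisper::job_remove(job_id);
    return code;
}
```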
@@ -616,7 +545,6 @@ struct rnwhisper_segments_callback_data {
 - (void)invalidate {
 [self stopCurrentTranscribe];
 whisper_free(self->ctx);
-[self freeBufferIfNeeded];
 }
 
 @end
package/lib/commonjs/version.json
CHANGED

@@ -1 +1 @@
-{"version":"1.
+{"version":"1.5.1"}
package/lib/module/version.json
CHANGED

@@ -1 +1 @@
-{"version":"1.
+{"version":"1.5.1"}