whisper.rn 0.4.0-rc.4 → 0.4.0-rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +5 -0
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +57 -134
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +188 -112
- package/cpp/README.md +1 -1
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +4 -2
- package/cpp/ggml-alloc.c +55 -19
- package/cpp/ggml-alloc.h +8 -1
- package/cpp/ggml-backend-impl.h +46 -21
- package/cpp/ggml-backend.c +563 -156
- package/cpp/ggml-backend.h +62 -17
- package/cpp/ggml-impl.h +1 -1
- package/cpp/ggml-metal-whisper.metal +2444 -359
- package/cpp/ggml-metal.h +7 -1
- package/cpp/ggml-metal.m +1105 -197
- package/cpp/ggml-quants.c +66 -61
- package/cpp/ggml-quants.h +40 -40
- package/cpp/ggml.c +1040 -1590
- package/cpp/ggml.h +109 -30
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +143 -59
- package/cpp/rn-whisper.h +48 -15
- package/cpp/whisper.cpp +1635 -928
- package/cpp/whisper.h +55 -10
- package/ios/RNWhisper.mm +7 -7
- package/ios/RNWhisperAudioUtils.h +0 -2
- package/ios/RNWhisperAudioUtils.m +0 -56
- package/ios/RNWhisperContext.h +3 -11
- package/ios/RNWhisperContext.mm +68 -137
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/index.d.ts +5 -0
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +6 -5
- package/src/index.ts +5 -0
- package/src/version.json +1 -1
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
- package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
package/cpp/whisper.h
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#ifndef WHISPER_H
|
|
2
2
|
#define WHISPER_H
|
|
3
3
|
|
|
4
|
+
#include "ggml.h"
|
|
5
|
+
|
|
4
6
|
#include <stddef.h>
|
|
5
7
|
#include <stdint.h>
|
|
6
8
|
#include <stdbool.h>
|
|
@@ -48,7 +50,9 @@ extern "C" {
|
|
|
48
50
|
//
|
|
49
51
|
// ...
|
|
50
52
|
//
|
|
51
|
-
//
|
|
53
|
+
// whisper_context_params cparams = whisper_context_default_params();
|
|
54
|
+
//
|
|
55
|
+
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
|
|
52
56
|
//
|
|
53
57
|
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
|
54
58
|
// fprintf(stderr, "failed to process audio\n");
|
|
@@ -76,7 +80,9 @@ extern "C" {
|
|
|
76
80
|
struct whisper_state;
|
|
77
81
|
struct whisper_full_params;
|
|
78
82
|
|
|
79
|
-
typedef
|
|
83
|
+
typedef int32_t whisper_pos;
|
|
84
|
+
typedef int32_t whisper_token;
|
|
85
|
+
typedef int32_t whisper_seq_id;
|
|
80
86
|
|
|
81
87
|
struct whisper_context_params {
|
|
82
88
|
bool use_gpu;
|
|
@@ -108,18 +114,49 @@ extern "C" {
|
|
|
108
114
|
void (*close)(void * ctx);
|
|
109
115
|
} whisper_model_loader;
|
|
110
116
|
|
|
117
|
+
// grammar element type
|
|
118
|
+
enum whisper_gretype {
|
|
119
|
+
// end of rule definition
|
|
120
|
+
WHISPER_GRETYPE_END = 0,
|
|
121
|
+
|
|
122
|
+
// start of alternate definition for rule
|
|
123
|
+
WHISPER_GRETYPE_ALT = 1,
|
|
124
|
+
|
|
125
|
+
// non-terminal element: reference to rule
|
|
126
|
+
WHISPER_GRETYPE_RULE_REF = 2,
|
|
127
|
+
|
|
128
|
+
// terminal element: character (code point)
|
|
129
|
+
WHISPER_GRETYPE_CHAR = 3,
|
|
130
|
+
|
|
131
|
+
// inverse char(s) ([^a], [^a-b], [^abc])
|
|
132
|
+
WHISPER_GRETYPE_CHAR_NOT = 4,
|
|
133
|
+
|
|
134
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
|
|
135
|
+
// be an inclusive range ([a-z])
|
|
136
|
+
WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
|
|
137
|
+
|
|
138
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or
|
|
139
|
+
// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
|
140
|
+
WHISPER_GRETYPE_CHAR_ALT = 6,
|
|
141
|
+
};
|
|
142
|
+
|
|
143
|
+
typedef struct whisper_grammar_element {
|
|
144
|
+
enum whisper_gretype type;
|
|
145
|
+
uint32_t value; // Unicode code point or rule ID
|
|
146
|
+
} whisper_grammar_element;
|
|
147
|
+
|
|
111
148
|
// Various functions for loading a ggml whisper model.
|
|
112
149
|
// Allocate (almost) all memory needed for the model.
|
|
113
150
|
// Return NULL on failure
|
|
114
|
-
WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model,
|
|
115
|
-
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,
|
|
116
|
-
WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
|
|
151
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
|
|
152
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
|
153
|
+
WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
|
|
117
154
|
|
|
118
155
|
// These are the same as the above, but the internal state of the context is not allocated automatically
|
|
119
156
|
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
|
|
120
|
-
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model,
|
|
121
|
-
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,
|
|
122
|
-
WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
|
|
157
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
|
|
158
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
|
159
|
+
WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
|
|
123
160
|
|
|
124
161
|
WHISPER_DEPRECATED(
|
|
125
162
|
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
|
|
@@ -279,6 +316,9 @@ extern "C" {
|
|
|
279
316
|
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
|
|
280
317
|
WHISPER_API const char * whisper_lang_str(int id);
|
|
281
318
|
|
|
319
|
+
// Return the full language name for the specified language id (e.g. 2 -> "german"), returns nullptr if not found
|
|
320
|
+
WHISPER_API const char * whisper_lang_str_full(int id);
|
|
321
|
+
|
|
282
322
|
// Use mel data at offset_ms to try and auto-detect the spoken language
|
|
283
323
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
|
284
324
|
// Returns the top language id or negative on failure
|
|
@@ -401,6 +441,7 @@ extern "C" {
|
|
|
401
441
|
|
|
402
442
|
bool translate;
|
|
403
443
|
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
|
|
444
|
+
bool no_timestamps; // do not generate timestamps
|
|
404
445
|
bool single_segment; // force single segment output (useful for streaming)
|
|
405
446
|
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
|
|
406
447
|
bool print_progress; // print progress information
|
|
@@ -478,6 +519,11 @@ extern "C" {
|
|
|
478
519
|
// called by each decoder to filter obtained logits
|
|
479
520
|
whisper_logits_filter_callback logits_filter_callback;
|
|
480
521
|
void * logits_filter_callback_user_data;
|
|
522
|
+
|
|
523
|
+
const whisper_grammar_element ** grammar_rules;
|
|
524
|
+
size_t n_grammar_rules;
|
|
525
|
+
size_t i_start_rule;
|
|
526
|
+
float grammar_penalty;
|
|
481
527
|
};
|
|
482
528
|
|
|
483
529
|
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
|
|
@@ -571,8 +617,7 @@ extern "C" {
|
|
|
571
617
|
|
|
572
618
|
// Control logging output; default behavior is to print to stderr
|
|
573
619
|
|
|
574
|
-
|
|
575
|
-
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
|
|
620
|
+
WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
|
|
576
621
|
|
|
577
622
|
#ifdef __cplusplus
|
|
578
623
|
}
|
package/ios/RNWhisper.mm
CHANGED
|
@@ -142,9 +142,9 @@ RCT_REMAP_METHOD(transcribeFile,
|
|
|
142
142
|
audioDataCount:count
|
|
143
143
|
options:options
|
|
144
144
|
onProgress: ^(int progress) {
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
145
|
+
rnwhisper::job* job = rnwhisper::job_get(jobId);
|
|
146
|
+
if (job && job->is_aborted()) return;
|
|
147
|
+
|
|
148
148
|
dispatch_async(dispatch_get_main_queue(), ^{
|
|
149
149
|
[self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
|
|
150
150
|
body:@{
|
|
@@ -156,9 +156,9 @@ RCT_REMAP_METHOD(transcribeFile,
|
|
|
156
156
|
});
|
|
157
157
|
}
|
|
158
158
|
onNewSegments: ^(NSDictionary *result) {
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
159
|
+
rnwhisper::job* job = rnwhisper::job_get(jobId);
|
|
160
|
+
if (job && job->is_aborted()) return;
|
|
161
|
+
|
|
162
162
|
dispatch_async(dispatch_get_main_queue(), ^{
|
|
163
163
|
[self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
|
|
164
164
|
body:@{
|
|
@@ -279,7 +279,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
|
|
|
279
279
|
[context invalidate];
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
-
|
|
282
|
+
rnwhisper::job_abort_all(); // graceful abort
|
|
283
283
|
|
|
284
284
|
[contexts removeAllObjects];
|
|
285
285
|
contexts = nil;
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
@interface RNWhisperAudioUtils : NSObject
|
|
4
4
|
|
|
5
|
-
+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
|
|
6
|
-
+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
|
|
7
5
|
+ (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
|
|
8
6
|
|
|
9
7
|
@end
|
|
@@ -3,62 +3,6 @@
|
|
|
3
3
|
|
|
4
4
|
@implementation RNWhisperAudioUtils
|
|
5
5
|
|
|
6
|
-
+ (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples {
|
|
7
|
-
NSMutableData *outputData = [NSMutableData data];
|
|
8
|
-
for (int i = 0; i < buffers.count; i++) {
|
|
9
|
-
int size = [sliceNSamples objectAtIndex:i].intValue;
|
|
10
|
-
NSValue *buffer = [buffers objectAtIndex:i];
|
|
11
|
-
short *bufferPtr = buffer.pointerValue;
|
|
12
|
-
[outputData appendBytes:bufferPtr length:size * sizeof(short)];
|
|
13
|
-
}
|
|
14
|
-
return outputData;
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile {
|
|
18
|
-
NSMutableData *outputData = [NSMutableData data];
|
|
19
|
-
|
|
20
|
-
// WAVE header
|
|
21
|
-
[outputData appendData:[@"RIFF" dataUsingEncoding:NSUTF8StringEncoding]]; // chunk id
|
|
22
|
-
int chunkSize = CFSwapInt32HostToLittle(36 + rawData.length);
|
|
23
|
-
[outputData appendBytes:&chunkSize length:sizeof(chunkSize)];
|
|
24
|
-
[outputData appendData:[@"WAVE" dataUsingEncoding:NSUTF8StringEncoding]]; // format
|
|
25
|
-
[outputData appendData:[@"fmt " dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 1 id
|
|
26
|
-
|
|
27
|
-
int subchunk1Size = CFSwapInt32HostToLittle(16);
|
|
28
|
-
[outputData appendBytes:&subchunk1Size length:sizeof(subchunk1Size)];
|
|
29
|
-
|
|
30
|
-
short audioFormat = CFSwapInt16HostToLittle(1); // PCM
|
|
31
|
-
[outputData appendBytes:&audioFormat length:sizeof(audioFormat)];
|
|
32
|
-
|
|
33
|
-
short numChannels = CFSwapInt16HostToLittle(1); // mono
|
|
34
|
-
[outputData appendBytes:&numChannels length:sizeof(numChannels)];
|
|
35
|
-
|
|
36
|
-
int sampleRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE);
|
|
37
|
-
[outputData appendBytes:&sampleRate length:sizeof(sampleRate)];
|
|
38
|
-
|
|
39
|
-
// (bitDepth * sampleRate * channels) >> 3
|
|
40
|
-
int byteRate = CFSwapInt32HostToLittle(WHISPER_SAMPLE_RATE * 1 * 16 / 8);
|
|
41
|
-
[outputData appendBytes:&byteRate length:sizeof(byteRate)];
|
|
42
|
-
|
|
43
|
-
// (bitDepth * channels) >> 3
|
|
44
|
-
short blockAlign = CFSwapInt16HostToLittle(16 / 8);
|
|
45
|
-
[outputData appendBytes:&blockAlign length:sizeof(blockAlign)];
|
|
46
|
-
|
|
47
|
-
// bitDepth
|
|
48
|
-
short bitsPerSample = CFSwapInt16HostToLittle(16);
|
|
49
|
-
[outputData appendBytes:&bitsPerSample length:sizeof(bitsPerSample)];
|
|
50
|
-
|
|
51
|
-
[outputData appendData:[@"data" dataUsingEncoding:NSUTF8StringEncoding]]; // subchunk 2 id
|
|
52
|
-
int subchunk2Size = CFSwapInt32HostToLittle((int)rawData.length);
|
|
53
|
-
[outputData appendBytes:&subchunk2Size length:sizeof(subchunk2Size)];
|
|
54
|
-
|
|
55
|
-
// Audio data
|
|
56
|
-
[outputData appendData:rawData];
|
|
57
|
-
|
|
58
|
-
// Save to file
|
|
59
|
-
[outputData writeToFile:audioOutputFile atomically:YES];
|
|
60
|
-
}
|
|
61
|
-
|
|
62
6
|
+ (float *)decodeWaveFile:(NSString*)filePath count:(int *)count {
|
|
63
7
|
NSURL *url = [NSURL fileURLWithPath:filePath];
|
|
64
8
|
NSData *fileData = [NSData dataWithContentsOfURL:url];
|
package/ios/RNWhisperContext.h
CHANGED
|
@@ -11,29 +11,21 @@
|
|
|
11
11
|
|
|
12
12
|
typedef struct {
|
|
13
13
|
__unsafe_unretained id mSelf;
|
|
14
|
-
|
|
15
|
-
int jobId;
|
|
16
14
|
NSDictionary* options;
|
|
17
15
|
|
|
16
|
+
struct rnwhisper::job * job;
|
|
17
|
+
|
|
18
18
|
bool isTranscribing;
|
|
19
19
|
bool isRealtime;
|
|
20
20
|
bool isCapturing;
|
|
21
21
|
bool isStoppedByAction;
|
|
22
|
-
int maxAudioSec;
|
|
23
22
|
int nSamplesTranscribing;
|
|
24
|
-
|
|
25
|
-
NSMutableArray<NSNumber *> *sliceNSamples;
|
|
23
|
+
std::vector<int> sliceNSamples;
|
|
26
24
|
bool isUseSlices;
|
|
27
25
|
int sliceIndex;
|
|
28
26
|
int transcribeSliceIndex;
|
|
29
|
-
int audioSliceSec;
|
|
30
27
|
NSString* audioOutputPath;
|
|
31
28
|
|
|
32
|
-
bool useVad;
|
|
33
|
-
int vadMs;
|
|
34
|
-
float vadThold;
|
|
35
|
-
float vadFreqThold;
|
|
36
|
-
|
|
37
29
|
AudioQueueRef queue;
|
|
38
30
|
AudioStreamBasicDescription dataFormat;
|
|
39
31
|
AudioQueueBufferRef buffers[NUM_BUFFERS];
|
package/ios/RNWhisperContext.mm
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
#import "RNWhisperContext.h"
|
|
2
|
-
#import "RNWhisperAudioUtils.h"
|
|
3
2
|
#import <Metal/Metal.h>
|
|
4
3
|
#include <vector>
|
|
5
4
|
|
|
@@ -95,7 +94,7 @@
|
|
|
95
94
|
return self->dQueue;
|
|
96
95
|
}
|
|
97
96
|
|
|
98
|
-
- (void)prepareRealtime:(NSDictionary *)options {
|
|
97
|
+
- (void)prepareRealtime:(int)jobId options:(NSDictionary *)options {
|
|
99
98
|
self->recordState.options = options;
|
|
100
99
|
|
|
101
100
|
self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
|
|
@@ -108,74 +107,39 @@
|
|
|
108
107
|
self->recordState.dataFormat.mReserved = 0;
|
|
109
108
|
self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
|
|
110
109
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
self->recordState.
|
|
114
|
-
|
|
115
|
-
int realtimeAudioSliceSec = options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0;
|
|
116
|
-
int audioSliceSec = realtimeAudioSliceSec > 0 && realtimeAudioSliceSec < maxAudioSec ? realtimeAudioSliceSec : maxAudioSec;
|
|
117
|
-
|
|
118
|
-
self->recordState.audioOutputPath = options[@"audioOutputPath"];
|
|
119
|
-
|
|
120
|
-
self->recordState.useVad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false;
|
|
121
|
-
self->recordState.vadMs = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000;
|
|
122
|
-
if (self->recordState.vadMs < 2000) self->recordState.vadMs = 2000;
|
|
123
|
-
|
|
124
|
-
self->recordState.vadThold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f;
|
|
125
|
-
self->recordState.vadFreqThold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f;
|
|
126
|
-
|
|
127
|
-
self->recordState.audioSliceSec = audioSliceSec;
|
|
128
|
-
self->recordState.isUseSlices = audioSliceSec < maxAudioSec;
|
|
110
|
+
self->recordState.isRealtime = true;
|
|
111
|
+
self->recordState.isTranscribing = false;
|
|
112
|
+
self->recordState.isCapturing = false;
|
|
113
|
+
self->recordState.isStoppedByAction = false;
|
|
129
114
|
|
|
130
115
|
self->recordState.sliceIndex = 0;
|
|
131
116
|
self->recordState.transcribeSliceIndex = 0;
|
|
132
117
|
self->recordState.nSamplesTranscribing = 0;
|
|
133
118
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
119
|
+
self->recordState.sliceNSamples.push_back(0);
|
|
120
|
+
|
|
121
|
+
self->recordState.job = rnwhisper::job_new(jobId, [self createParams:options jobId:jobId]);
|
|
122
|
+
self->recordState.job->set_realtime_params(
|
|
123
|
+
{
|
|
124
|
+
.use_vad = options[@"useVad"] != nil ? [options[@"useVad"] boolValue] : false,
|
|
125
|
+
.vad_ms = options[@"vadMs"] != nil ? [options[@"vadMs"] intValue] : 2000,
|
|
126
|
+
.vad_thold = options[@"vadThold"] != nil ? [options[@"vadThold"] floatValue] : 0.6f,
|
|
127
|
+
.freq_thold = options[@"vadFreqThold"] != nil ? [options[@"vadFreqThold"] floatValue] : 100.0f
|
|
128
|
+
},
|
|
129
|
+
options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0,
|
|
130
|
+
options[@"realtimeAudioSliceSec"] != nil ? [options[@"realtimeAudioSliceSec"] intValue] : 0,
|
|
131
|
+
options[@"realtimeAudioMinSec"] != nil ? [options[@"realtimeAudioMinSec"] floatValue] : 0,
|
|
132
|
+
options[@"audioOutputPath"] != nil ? [options[@"audioOutputPath"] UTF8String] : nullptr
|
|
133
|
+
);
|
|
134
|
+
self->recordState.isUseSlices = self->recordState.job->audio_slice_sec < self->recordState.job->audio_sec;
|
|
147
135
|
|
|
148
136
|
self->recordState.mSelf = self;
|
|
149
137
|
}
|
|
150
138
|
|
|
151
|
-
|
|
152
|
-
if (self->recordState.shortBufferSlices != nil) {
|
|
153
|
-
for (int i = 0; i < [self->recordState.shortBufferSlices count]; i++) {
|
|
154
|
-
int16_t *audioBufferI16 = (int16_t *) [self->recordState.shortBufferSlices[i] pointerValue];
|
|
155
|
-
free(audioBufferI16);
|
|
156
|
-
}
|
|
157
|
-
self->recordState.shortBufferSlices = nil;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
bool vad(RNWhisperContextRecordState *state, int16_t* audioBufferI16, int nSamples, int n)
|
|
139
|
+
bool vad(RNWhisperContextRecordState *state, int sliceIndex, int nSamples, int n)
|
|
162
140
|
{
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
int sampleSize = (int) (WHISPER_SAMPLE_RATE * state->vadMs / 1000);
|
|
166
|
-
if (nSamples + n > sampleSize) {
|
|
167
|
-
int start = nSamples + n - sampleSize;
|
|
168
|
-
std::vector<float> audioBufferF32Vec(sampleSize);
|
|
169
|
-
for (int i = 0; i < sampleSize; i++) {
|
|
170
|
-
audioBufferF32Vec[i] = (float)audioBufferI16[i + start] / 32768.0f;
|
|
171
|
-
}
|
|
172
|
-
isSpeech = rn_whisper_vad_simple(audioBufferF32Vec, WHISPER_SAMPLE_RATE, 1000, state->vadThold, state->vadFreqThold, false);
|
|
173
|
-
NSLog(@"[RNWhisper] VAD result: %d", isSpeech);
|
|
174
|
-
} else {
|
|
175
|
-
isSpeech = false;
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
return isSpeech;
|
|
141
|
+
if (state->isTranscribing) return true;
|
|
142
|
+
return state->job->vad_simple(sliceIndex, nSamples, n);
|
|
179
143
|
}
|
|
180
144
|
|
|
181
145
|
void AudioInputCallback(void * inUserData,
|
|
@@ -196,15 +160,15 @@ void AudioInputCallback(void * inUserData,
|
|
|
196
160
|
}
|
|
197
161
|
|
|
198
162
|
int totalNSamples = 0;
|
|
199
|
-
for (int i = 0; i <
|
|
200
|
-
totalNSamples +=
|
|
163
|
+
for (int i = 0; i < state->sliceNSamples.size(); i++) {
|
|
164
|
+
totalNSamples += state->sliceNSamples[i];
|
|
201
165
|
}
|
|
202
166
|
|
|
203
167
|
const int n = inBuffer->mAudioDataByteSize / 2;
|
|
204
168
|
|
|
205
|
-
int nSamples =
|
|
169
|
+
int nSamples = state->sliceNSamples[state->sliceIndex];
|
|
206
170
|
|
|
207
|
-
if (totalNSamples + n > state->
|
|
171
|
+
if (totalNSamples + n > state->job->audio_sec * WHISPER_SAMPLE_RATE) {
|
|
208
172
|
NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
|
|
209
173
|
state->isCapturing = false;
|
|
210
174
|
[state->mSelf stopAudio];
|
|
@@ -218,8 +182,8 @@ void AudioInputCallback(void * inUserData,
|
|
|
218
182
|
!state->isTranscribing &&
|
|
219
183
|
nSamples != state->nSamplesTranscribing
|
|
220
184
|
) {
|
|
221
|
-
|
|
222
|
-
if (!vad(state,
|
|
185
|
+
bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
|
|
186
|
+
if (!isSamplesEnough || !vad(state, state->sliceIndex, nSamples, 0)) {
|
|
223
187
|
[state->mSelf finishRealtimeTranscribe:state result:@{}];
|
|
224
188
|
return;
|
|
225
189
|
}
|
|
@@ -231,31 +195,25 @@ void AudioInputCallback(void * inUserData,
|
|
|
231
195
|
return;
|
|
232
196
|
}
|
|
233
197
|
|
|
234
|
-
|
|
235
|
-
if (nSamples + n > audioSliceSec * WHISPER_SAMPLE_RATE) {
|
|
198
|
+
if (nSamples + n > state->job->audio_slice_sec * WHISPER_SAMPLE_RATE) {
|
|
236
199
|
// next slice
|
|
237
200
|
state->sliceIndex++;
|
|
238
201
|
nSamples = 0;
|
|
239
|
-
|
|
240
|
-
[state->shortBufferSlices addObject:[NSValue valueWithPointer:audioBufferI16]];
|
|
241
|
-
[state->sliceNSamples addObject:[NSNumber numberWithInt:0]];
|
|
202
|
+
state->sliceNSamples.push_back(0);
|
|
242
203
|
}
|
|
243
204
|
|
|
244
|
-
// Append to buffer
|
|
245
205
|
NSLog(@"[RNWhisper] Slice %d has %d samples", state->sliceIndex, nSamples);
|
|
246
206
|
|
|
247
|
-
|
|
248
|
-
for (int i = 0; i < n; i++) {
|
|
249
|
-
audioBufferI16[nSamples + i] = ((short*)inBuffer->mAudioData)[i];
|
|
250
|
-
}
|
|
207
|
+
state->job->put_pcm_data((short*) inBuffer->mAudioData, state->sliceIndex, nSamples, n);
|
|
251
208
|
|
|
252
|
-
bool isSpeech = vad(state,
|
|
209
|
+
bool isSpeech = vad(state, state->sliceIndex, nSamples, n);
|
|
253
210
|
nSamples += n;
|
|
254
|
-
state->sliceNSamples[state->sliceIndex] =
|
|
211
|
+
state->sliceNSamples[state->sliceIndex] = nSamples;
|
|
255
212
|
|
|
256
213
|
AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
|
|
257
214
|
|
|
258
|
-
|
|
215
|
+
bool isSamplesEnough = nSamples / WHISPER_SAMPLE_RATE >= state->job->audio_min_sec;
|
|
216
|
+
if (!isSamplesEnough || !isSpeech) return;
|
|
259
217
|
|
|
260
218
|
if (!state->isTranscribing) {
|
|
261
219
|
state->isTranscribing = true;
|
|
@@ -267,32 +225,27 @@ void AudioInputCallback(void * inUserData,
|
|
|
267
225
|
|
|
268
226
|
- (void)finishRealtimeTranscribe:(RNWhisperContextRecordState*) state result:(NSDictionary*)result {
|
|
269
227
|
// Save wav if needed
|
|
270
|
-
if (state->
|
|
228
|
+
if (state->job->audio_output_path != nullptr) {
|
|
271
229
|
// TODO: Append in real time so we don't need to keep all slices & also reduce memory usage
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
];
|
|
230
|
+
rnaudioutils::save_wav_file(
|
|
231
|
+
rnaudioutils::concat_short_buffers(state->job->pcm_slices, state->sliceNSamples),
|
|
232
|
+
state->job->audio_output_path
|
|
233
|
+
);
|
|
277
234
|
}
|
|
278
|
-
state->transcribeHandler(state->
|
|
235
|
+
state->transcribeHandler(state->job->job_id, @"end", result);
|
|
236
|
+
rnwhisper::job_remove(state->job->job_id);
|
|
279
237
|
}
|
|
280
238
|
|
|
281
239
|
- (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
|
|
282
|
-
int nSamplesOfIndex =
|
|
240
|
+
int nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
|
|
283
241
|
state->nSamplesTranscribing = nSamplesOfIndex;
|
|
284
242
|
NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
|
|
285
243
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
// convert I16 to F32
|
|
289
|
-
for (int i = 0; i < state->nSamplesTranscribing; i++) {
|
|
290
|
-
audioBufferF32[i] = (float)audioBufferI16[i] / 32768.0f;
|
|
291
|
-
}
|
|
244
|
+
float* pcmf32 = state->job->pcm_slice_to_f32(state->transcribeSliceIndex, state->nSamplesTranscribing);
|
|
245
|
+
|
|
292
246
|
CFTimeInterval timeStart = CACurrentMediaTime();
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
free(audioBufferF32);
|
|
247
|
+
int code = [state->mSelf fullTranscribe:state->job audioData:pcmf32 audioDataCount:state->nSamplesTranscribing];
|
|
248
|
+
free(pcmf32);
|
|
296
249
|
CFTimeInterval timeEnd = CACurrentMediaTime();
|
|
297
250
|
const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
|
|
298
251
|
|
|
@@ -312,7 +265,7 @@ void AudioInputCallback(void * inUserData,
|
|
|
312
265
|
result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
|
|
313
266
|
}
|
|
314
267
|
|
|
315
|
-
nSamplesOfIndex =
|
|
268
|
+
nSamplesOfIndex = state->sliceNSamples[state->transcribeSliceIndex];
|
|
316
269
|
|
|
317
270
|
bool isStopped = state->isStoppedByAction || (
|
|
318
271
|
!state->isCapturing &&
|
|
@@ -340,10 +293,10 @@ void AudioInputCallback(void * inUserData,
|
|
|
340
293
|
[state->mSelf finishRealtimeTranscribe:state result:result];
|
|
341
294
|
} else if (code == 0) {
|
|
342
295
|
result[@"isCapturing"] = @(true);
|
|
343
|
-
state->transcribeHandler(state->
|
|
296
|
+
state->transcribeHandler(state->job->job_id, @"transcribe", result);
|
|
344
297
|
} else {
|
|
345
298
|
result[@"isCapturing"] = @(true);
|
|
346
|
-
state->transcribeHandler(state->
|
|
299
|
+
state->transcribeHandler(state->job->job_id, @"transcribe", result);
|
|
347
300
|
}
|
|
348
301
|
|
|
349
302
|
if (continueNeeded) {
|
|
@@ -371,8 +324,7 @@ void AudioInputCallback(void * inUserData,
|
|
|
371
324
|
onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
|
|
372
325
|
{
|
|
373
326
|
self->recordState.transcribeHandler = onTranscribe;
|
|
374
|
-
self
|
|
375
|
-
[self prepareRealtime:options];
|
|
327
|
+
[self prepareRealtime:jobId options:options];
|
|
376
328
|
|
|
377
329
|
OSStatus status = AudioQueueNewInput(
|
|
378
330
|
&self->recordState.dataFormat,
|
|
@@ -413,9 +365,9 @@ struct rnwhisper_segments_callback_data {
|
|
|
413
365
|
dispatch_async(dQueue, ^{
|
|
414
366
|
self->recordState.isStoppedByAction = false;
|
|
415
367
|
self->recordState.isTranscribing = true;
|
|
416
|
-
self->recordState.jobId = jobId;
|
|
417
368
|
|
|
418
|
-
whisper_full_params params = [self
|
|
369
|
+
whisper_full_params params = [self createParams:options jobId:jobId];
|
|
370
|
+
|
|
419
371
|
if (options[@"onProgress"] && [options[@"onProgress"] boolValue]) {
|
|
420
372
|
params.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
|
|
421
373
|
void (^onProgress)(int) = (__bridge void (^)(int))user_data;
|
|
@@ -460,8 +412,10 @@ struct rnwhisper_segments_callback_data {
|
|
|
460
412
|
};
|
|
461
413
|
params.new_segment_callback_user_data = &user_data;
|
|
462
414
|
}
|
|
463
|
-
|
|
464
|
-
|
|
415
|
+
|
|
416
|
+
rnwhisper::job* job = rnwhisper::job_new(jobId, params);;
|
|
417
|
+
int code = [self fullTranscribe:job audioData:audioData audioDataCount:audioDataCount];
|
|
418
|
+
rnwhisper::job_remove(jobId);
|
|
465
419
|
self->recordState.isTranscribing = false;
|
|
466
420
|
onEnd(code);
|
|
467
421
|
});
|
|
@@ -476,7 +430,7 @@ struct rnwhisper_segments_callback_data {
|
|
|
476
430
|
}
|
|
477
431
|
|
|
478
432
|
- (void)stopTranscribe:(int)jobId {
|
|
479
|
-
|
|
433
|
+
if (self->recordState.job) self->recordState.job->abort();
|
|
480
434
|
if (self->recordState.isRealtime && self->recordState.isCapturing) {
|
|
481
435
|
[self stopAudio];
|
|
482
436
|
if (!self->recordState.isTranscribing) {
|
|
@@ -490,13 +444,11 @@ struct rnwhisper_segments_callback_data {
|
|
|
490
444
|
}
|
|
491
445
|
|
|
492
446
|
- (void)stopCurrentTranscribe {
|
|
493
|
-
if (
|
|
494
|
-
|
|
495
|
-
}
|
|
496
|
-
[self stopTranscribe:self->recordState.jobId];
|
|
447
|
+
if (self->recordState.job == nullptr) return;
|
|
448
|
+
[self stopTranscribe:self->recordState.job->job_id];
|
|
497
449
|
}
|
|
498
450
|
|
|
499
|
-
- (struct whisper_full_params)
|
|
451
|
+
- (struct whisper_full_params)createParams:(NSDictionary *)options jobId:(int)jobId {
|
|
500
452
|
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
|
501
453
|
|
|
502
454
|
const int n_threads = options[@"maxThreads"] != nil ?
|
|
@@ -517,7 +469,7 @@ struct rnwhisper_segments_callback_data {
|
|
|
517
469
|
params.print_special = false;
|
|
518
470
|
params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
|
|
519
471
|
params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
|
|
520
|
-
params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
|
|
472
|
+
params.language = options[@"language"] != nil ? strdup([options[@"language"] UTF8String]) : "auto";
|
|
521
473
|
params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
|
|
522
474
|
params.offset_ms = 0;
|
|
523
475
|
params.no_context = true;
|
|
@@ -534,7 +486,6 @@ struct rnwhisper_segments_callback_data {
|
|
|
534
486
|
if (options[@"maxContext"] != nil) {
|
|
535
487
|
params.n_max_text_ctx = [options[@"maxContext"] intValue];
|
|
536
488
|
}
|
|
537
|
-
|
|
538
489
|
if (options[@"offset"] != nil) {
|
|
539
490
|
params.offset_ms = [options[@"offset"] intValue];
|
|
540
491
|
}
|
|
@@ -550,39 +501,20 @@ struct rnwhisper_segments_callback_data {
|
|
|
550
501
|
if (options[@"temperatureInc"] != nil) {
|
|
551
502
|
params.temperature_inc = [options[@"temperature_inc"] floatValue];
|
|
552
503
|
}
|
|
553
|
-
|
|
554
504
|
if (options[@"prompt"] != nil) {
|
|
555
|
-
params.initial_prompt = [options[@"prompt"] UTF8String];
|
|
505
|
+
params.initial_prompt = strdup([options[@"prompt"] UTF8String]);
|
|
556
506
|
}
|
|
557
507
|
|
|
558
|
-
// abort handler
|
|
559
|
-
bool *abort_ptr = rn_whisper_assign_abort_map(jobId);
|
|
560
|
-
params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
|
561
|
-
bool is_aborted = *(bool*)user_data;
|
|
562
|
-
return !is_aborted;
|
|
563
|
-
};
|
|
564
|
-
params.encoder_begin_callback_user_data = abort_ptr;
|
|
565
|
-
params.abort_callback = [](void * user_data) {
|
|
566
|
-
bool is_aborted = *(bool*)user_data;
|
|
567
|
-
return is_aborted;
|
|
568
|
-
};
|
|
569
|
-
params.abort_callback_user_data = abort_ptr;
|
|
570
|
-
|
|
571
508
|
return params;
|
|
572
509
|
}
|
|
573
510
|
|
|
574
|
-
- (int)fullTranscribe:(
|
|
575
|
-
params:(struct whisper_full_params)params
|
|
511
|
+
- (int)fullTranscribe:(rnwhisper::job *)job
|
|
576
512
|
audioData:(float *)audioData
|
|
577
513
|
audioDataCount:(int)audioDataCount
|
|
578
514
|
{
|
|
579
515
|
whisper_reset_timings(self->ctx);
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
if (rn_whisper_transcribe_is_aborted(jobId)) {
|
|
583
|
-
code = -999;
|
|
584
|
-
}
|
|
585
|
-
rn_whisper_remove_abort_map(jobId);
|
|
516
|
+
int code = whisper_full(self->ctx, job->params, audioData, audioDataCount);
|
|
517
|
+
if (job && job->is_aborted()) code = -999;
|
|
586
518
|
// if (code == 0) {
|
|
587
519
|
// whisper_print_timings(self->ctx);
|
|
588
520
|
// }
|
|
@@ -616,7 +548,6 @@ struct rnwhisper_segments_callback_data {
|
|
|
616
548
|
- (void)invalidate {
|
|
617
549
|
[self stopCurrentTranscribe];
|
|
618
550
|
whisper_free(self->ctx);
|
|
619
|
-
[self freeBufferIfNeeded];
|
|
620
551
|
}
|
|
621
552
|
|
|
622
553
|
@end
|