whisper.rn 0.1.5 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -4
- package/android/build.gradle +2 -4
- package/android/src/main/java/com/rnwhisper/RNWhisperModule.java +47 -7
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +224 -7
- package/android/src/main/jni/whisper/Whisper.mk +1 -1
- package/android/src/main/jni/whisper/jni.cpp +34 -5
- package/cpp/rn-whisper.cpp +26 -0
- package/cpp/rn-whisper.h +5 -0
- package/ios/RNWhisper.h +2 -2
- package/ios/RNWhisper.mm +78 -111
- package/ios/RNWhisperContext.h +55 -0
- package/ios/RNWhisperContext.mm +326 -0
- package/jest/mock.js +43 -2
- package/lib/commonjs/index.js +59 -2
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +60 -3
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/index.d.ts +63 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.tsx +124 -4
package/ios/RNWhisper.mm
CHANGED
```diff
@@ -1,23 +1,8 @@
-
 #import "RNWhisper.h"
+#import "RNWhisperContext.h"
 #include <stdlib.h>
 #include <string>
 
-@interface WhisperContext : NSObject {
-}
-
-@property struct whisper_context * ctx;
-
-@end
-
-@implementation WhisperContext
-
-- (void)invalidate {
-    whisper_free(self.ctx);
-}
-
-@end
-
 @implementation RNWhisper
 
 NSMutableDictionary *contexts;
@@ -33,10 +18,8 @@ RCT_REMAP_METHOD(initContext,
         contexts = [[NSMutableDictionary alloc] init];
     }
 
-
-
-
-    if (context.ctx == NULL) {
+    RNWhisperContext *context = [RNWhisperContext initWithModelPath:modelPath];
+    if ([context getContext] == NULL) {
        reject(@"whisper_cpp_error", @"Failed to load the model", nil);
        return;
    }
```
```diff
@@ -47,123 +30,105 @@
    resolve([NSNumber numberWithInt:contextId]);
 }
 
-RCT_REMAP_METHOD(
+RCT_REMAP_METHOD(transcribeFile,
                 withContextId:(int)contextId
+                withJobId:(int)jobId
                 withWaveFile:(NSString *)waveFilePath
                 withOptions:(NSDictionary *)options
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
 {
-
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
 
    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"Context is already transcribing", nil);
+        return;
+    }
 
    NSURL *url = [NSURL fileURLWithPath:waveFilePath];
 
    int count = 0;
    float *waveFile = [self decodeWaveFile:url count:&count];
-
    if (waveFile == nil) {
        reject(@"whisper_error", @"Invalid file", nil);
        return;
    }
-
-
-
-
-
-        MIN(4, (int)[[NSProcessInfo processInfo] processorCount]);
-
-    if (options[@"beamSize"] != nil) {
-        params.strategy = WHISPER_SAMPLING_BEAM_SEARCH;
-        params.beam_search.beam_size = [options[@"beamSize"] intValue];
+    int code = [context transcribeFile:jobId audioData:waveFile audioDataCount:count options:options];
+    if (code != 0) {
+        free(waveFile);
+        reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
+        return;
    }
+    free(waveFile);
+    resolve([context getTextSegments]);
+}
 
-
-
-
-
-
-
-    params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
-    params.n_threads = max_threads;
-    params.offset_ms = 0;
-    params.no_context = true;
-    params.single_segment = false;
-
-    if (options[@"maxLen"] != nil) {
-        params.max_len = [options[@"maxLen"] intValue];
-    }
-    params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+- (NSArray *)supportedEvents {
+  return @[
+    @"@RNWhisper_onRealtimeTranscribe",
+    @"@RNWhisper_onRealtimeTranscribeEnd",
+  ];
+}
 
-
-
-
-
-
-
-
-
-        params.offset_ms = [options[@"offset"] intValue];
-    }
-    if (options[@"duration"] != nil) {
-        params.duration_ms = [options[@"duration"] intValue];
-    }
-    if (options[@"wordThold"] != nil) {
-        params.thold_pt = [options[@"wordThold"] intValue];
-    }
-    if (options[@"temperature"] != nil) {
-        params.temperature = [options[@"temperature"] floatValue];
-    }
-    if (options[@"temperatureInc"] != nil) {
-        params.temperature_inc = [options[@"temperature_inc"] floatValue];
-    }
-
-    if (options[@"prompt"] != nil) {
-        std::string *prompt = new std::string([options[@"prompt"] UTF8String]);
-        rn_whisper_convert_prompt(
-            context.ctx,
-            params,
-            prompt
-        );
-    }
+RCT_REMAP_METHOD(startRealtimeTranscribe,
+                withContextId:(int)contextId
+                withJobId:(int)jobId
+                withOptions:(NSDictionary *)options
+                withResolver:(RCTPromiseResolveBlock)resolve
+                withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
 
-
-
-
-
-
-        reject(@"
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is already capturing", nil);
        return;
    }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    OSStatus status = [context transcribeRealtime:jobId
+        options:options
+        onTranscribe:^(int _jobId, NSString *type, NSDictionary *payload) {
+            NSString *eventName = nil;
+            if ([type isEqual:@"transcribe"]) {
+                eventName = @"@RNWhisper_onRealtimeTranscribe";
+            } else if ([type isEqual:@"end"]) {
+                eventName = @"@RNWhisper_onRealtimeTranscribeEnd";
+            }
+            if (eventName == nil) {
+                return;
+            }
+            [self sendEventWithName:eventName
+                body:@{
+                    @"contextId": [NSNumber numberWithInt:contextId],
+                    @"jobId": [NSNumber numberWithInt:jobId],
+                    @"payload": payload
+                }
+            ];
+        }
+    ];
+    if (status == 0) {
+        resolve(nil);
+        return;
    }
-
-
-
-
+    reject(@"whisper_error", [NSString stringWithFormat:@"Failed to start realtime transcribe. Status: %d", status], nil);
+}
+RCT_REMAP_METHOD(abortTranscribe,
+                withContextId:(int)contextId
+                withJobId:(int)jobId)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+    [context stopTranscribe:jobId];
 }
 
 RCT_REMAP_METHOD(releaseContext,
@@ -171,7 +136,7 @@ RCT_REMAP_METHOD(releaseContext,
                 withResolver:(RCTPromiseResolveBlock)resolve
                 withRejecter:(RCTPromiseRejectBlock)reject)
 {
-
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
    if (context == nil) {
        reject(@"whisper_error", @"Context not found", nil);
        return;
@@ -210,12 +175,14 @@ RCT_REMAP_METHOD(releaseAllContexts,
 }
 
 - (void)invalidate {
+    rn_whisper_abort_all_transcribe();
+
    if (contexts == nil) {
        return;
    }
 
    for (NSNumber *contextId in contexts) {
-
+        RNWhisperContext *context = contexts[contextId];
        [context invalidate];
    }
 
```
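The JS-side wrapper (package/src/index.tsx, whose diff is not shown here) is what normally drives these methods over the bridge. As a rough sketch of the surface this file now exposes, assuming only the method names, argument order, and event names visible in the hunks above:

```ts
import { NativeModules, NativeEventEmitter } from 'react-native';

// Illustrative only: calling the native module directly, bypassing the
// package's own index.tsx wrapper.
const { RNWhisper } = NativeModules;
const emitter = new NativeEventEmitter(RNWhisper);

async function startRealtime(contextId: number, jobId: number) {
  const onUpdate = emitter.addListener('@RNWhisper_onRealtimeTranscribe', (ev: any) => {
    // ev is the body built in startRealtimeTranscribe above:
    // { contextId, jobId, payload }
    if (ev.jobId === jobId) console.log(ev.payload);
  });
  const onEnd = emitter.addListener('@RNWhisper_onRealtimeTranscribeEnd', (ev: any) => {
    if (ev.jobId !== jobId) return;
    onUpdate.remove();
    onEnd.remove();
  });
  // Resolves with nil once AudioQueueStart succeeds; rejects otherwise.
  await RNWhisper.startRealtimeTranscribe(contextId, jobId, {});
}

// Flips the job's abort flag and stops capture (see stopTranscribe below).
const stopRealtime = (contextId: number, jobId: number) =>
  RNWhisper.abortTranscribe(contextId, jobId);
```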
package/ios/RNWhisperContext.h
ADDED

```diff
@@ -0,0 +1,55 @@
+#ifdef __cplusplus
+#import "whisper.h"
+#import "rn-whisper.h"
+#endif
+
+#import <AVFoundation/AVFoundation.h>
+#import <AudioToolbox/AudioQueue.h>
+
+#define NUM_BUFFERS 3
+#define DEFAULT_MAX_AUDIO_SEC 30
+
+typedef struct {
+    __unsafe_unretained id mSelf;
+
+    int jobId;
+    NSDictionary* options;
+
+    bool isTranscribing;
+    bool isRealtime;
+    bool isCapturing;
+    bool isStoppedByAction;
+    int maxAudioSec;
+    int nSamples;
+    int nSamplesTranscribing;
+    int16_t* audioBufferI16;
+    float* audioBufferF32;
+
+    AudioQueueRef queue;
+    AudioStreamBasicDescription dataFormat;
+    AudioQueueBufferRef buffers[NUM_BUFFERS];
+
+    void (^transcribeHandler)(int, NSString *, NSDictionary *);
+} RNWhisperContextRecordState;
+
+@interface RNWhisperContext : NSObject {
+    struct whisper_context * ctx;
+    RNWhisperContextRecordState recordState;
+}
+
++ (instancetype)initWithModelPath:(NSString *)modelPath;
+- (struct whisper_context *)getContext;
+- (OSStatus)transcribeRealtime:(int)jobId
+                       options:(NSDictionary *)options
+                  onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe;
+- (int)transcribeFile:(int)jobId
+            audioData:(float *)audioData
+       audioDataCount:(int)audioDataCount
+              options:(NSDictionary *)options;
+- (void)stopTranscribe:(int)jobId;
+- (bool)isCapturing;
+- (bool)isTranscribing;
+- (NSDictionary *)getTextSegments;
+- (void)invalidate;
+
+@end
```
package/ios/RNWhisperContext.mm
ADDED

```diff
@@ -0,0 +1,326 @@
+#import "RNWhisperContext.h"
+
+#define NUM_BYTES_PER_BUFFER 16 * 1024
+
+@implementation RNWhisperContext
+
++ (instancetype)initWithModelPath:(NSString *)modelPath {
+    RNWhisperContext *context = [[RNWhisperContext alloc] init];
+    context->ctx = whisper_init_from_file([modelPath UTF8String]);
+    return context;
+}
+
+- (struct whisper_context *)getContext {
+    return self->ctx;
+}
+
+- (void)prepareRealtime:(NSDictionary *)options {
+    self->recordState.options = options;
+
+    self->recordState.dataFormat.mSampleRate = WHISPER_SAMPLE_RATE; // 16000
+    self->recordState.dataFormat.mFormatID = kAudioFormatLinearPCM;
+    self->recordState.dataFormat.mFramesPerPacket = 1;
+    self->recordState.dataFormat.mChannelsPerFrame = 1; // mono
+    self->recordState.dataFormat.mBytesPerFrame = 2;
+    self->recordState.dataFormat.mBytesPerPacket = 2;
+    self->recordState.dataFormat.mBitsPerChannel = 16;
+    self->recordState.dataFormat.mReserved = 0;
+    self->recordState.dataFormat.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
+
+    self->recordState.nSamples = 0;
+
+    int maxAudioSecOpt = options[@"realtimeAudioSec"] != nil ? [options[@"realtimeAudioSec"] intValue] : 0;
+    int maxAudioSec = maxAudioSecOpt > 0 ? maxAudioSecOpt : DEFAULT_MAX_AUDIO_SEC;
+    self->recordState.maxAudioSec = maxAudioSec;
+    self->recordState.audioBufferI16 = (int16_t *) malloc(maxAudioSec * WHISPER_SAMPLE_RATE * sizeof(int16_t));
+    self->recordState.audioBufferF32 = (float *) malloc(maxAudioSec * WHISPER_SAMPLE_RATE * sizeof(float));
+
+    self->recordState.isRealtime = true;
+    self->recordState.isTranscribing = false;
+    self->recordState.isCapturing = false;
+
+    self->recordState.mSelf = self;
+}
```
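Both PCM buffers are allocated up front for the whole realtimeAudioSec window (default DEFAULT_MAX_AUDIO_SEC, 30 s), so the memory cost of a realtime session is fixed and easy to estimate. A quick sketch of the arithmetic, using the 16 kHz mono format set above:

```ts
const WHISPER_SAMPLE_RATE = 16000; // matches the sample rate configured above

function realtimeBufferBytes(realtimeAudioSec = 30) {
  const samples = realtimeAudioSec * WHISPER_SAMPLE_RATE; // 480,000 at 30 s
  const i16Bytes = samples * 2; // int16_t capture buffer: 960 KB
  const f32Bytes = samples * 4; // float conversion buffer: 1.92 MB
  return { samples, i16Bytes, f32Bytes };
}

// realtimeBufferBytes(30)
// -> { samples: 480000, i16Bytes: 960000, f32Bytes: 1920000 }
```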
```diff
+
+void AudioInputCallback(void * inUserData,
+    AudioQueueRef inAQ,
+    AudioQueueBufferRef inBuffer,
+    const AudioTimeStamp * inStartTime,
+    UInt32 inNumberPacketDescriptions,
+    const AudioStreamPacketDescription * inPacketDescs)
+{
+    RNWhisperContextRecordState *state = (RNWhisperContextRecordState *)inUserData;
+
+    if (!state->isCapturing) {
+        NSLog(@"[RNWhisper] Not capturing, ignoring audio");
+        if (!state->isTranscribing) {
+            state->transcribeHandler(state->jobId, @"end", @{});
+        }
+        return;
+    }
+
+    const int n = inBuffer->mAudioDataByteSize / 2;
+    NSLog(@"[RNWhisper] Captured %d new samples", n);
+
+    if (state->nSamples + n > state->maxAudioSec * WHISPER_SAMPLE_RATE) {
+        NSLog(@"[RNWhisper] Audio buffer is full, stop capturing");
+        state->isCapturing = false;
+        [state->mSelf stopAudio];
+        if (!state->isTranscribing && state->nSamples == state->nSamplesTranscribing) {
+            state->transcribeHandler(state->jobId, @"end", @{});
+        } else if (!state->isTranscribing && state->nSamples != state->nSamplesTranscribing) {
+            state->isTranscribing = true;
+            dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+                [state->mSelf fullTranscribeSamples:state];
+            });
+        }
+        return;
+    }
+
+    for (int i = 0; i < n; i++) {
+        state->audioBufferI16[state->nSamples + i] = ((short*)inBuffer->mAudioData)[i];
+    }
+    state->nSamples += n;
+
+    AudioQueueEnqueueBuffer(state->queue, inBuffer, 0, NULL);
+
+    if (!state->isTranscribing) {
+        state->isTranscribing = true;
+        dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+            [state->mSelf fullTranscribeSamples:state];
+        });
+    }
+}
+
```
|
+
- (void)fullTranscribeSamples:(RNWhisperContextRecordState*) state {
|
|
96
|
+
state->nSamplesTranscribing = state->nSamples;
|
|
97
|
+
NSLog(@"[RNWhisper] Transcribing %d samples", state->nSamplesTranscribing);
|
|
98
|
+
|
|
99
|
+
// convert I16 to F32
|
|
100
|
+
for (int i = 0; i < state->nSamplesTranscribing; i++) {
|
|
101
|
+
state->audioBufferF32[i] = (float)state->audioBufferI16[i] / 32768.0f;
|
|
102
|
+
}
|
|
103
|
+
CFTimeInterval timeStart = CACurrentMediaTime();
|
|
104
|
+
int code = [state->mSelf fullTranscribe:state->jobId audioData:state->audioBufferF32 audioDataCount:state->nSamplesTranscribing options:state->options];
|
|
105
|
+
CFTimeInterval timeEnd = CACurrentMediaTime();
|
|
106
|
+
const float timeRecording = (float) state->nSamplesTranscribing / (float) state->dataFormat.mSampleRate;
|
|
107
|
+
|
|
108
|
+
NSDictionary* base = @{
|
|
109
|
+
@"code": [NSNumber numberWithInt:code],
|
|
110
|
+
@"processTime": [NSNumber numberWithInt:(timeEnd - timeStart) * 1E3],
|
|
111
|
+
@"recordingTime": [NSNumber numberWithInt:timeRecording * 1E3],
|
|
112
|
+
};
|
|
113
|
+
NSMutableDictionary* result = [base mutableCopy];
|
|
114
|
+
|
|
115
|
+
if (code == 0) {
|
|
116
|
+
result[@"data"] = [state->mSelf getTextSegments];
|
|
117
|
+
} else {
|
|
118
|
+
result[@"error"] = [NSString stringWithFormat:@"Transcribe failed with code %d", code];
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if (state->isStoppedByAction || (!state->isCapturing && state->nSamplesTranscribing == state->nSamples)) {
|
|
122
|
+
NSLog(@"[RNWhisper] Transcribe end");
|
|
123
|
+
result[@"isStoppedByAction"] = @(state->isStoppedByAction);
|
|
124
|
+
result[@"isCapturing"] = @(false);
|
|
125
|
+
state->transcribeHandler(state->jobId, @"end", result);
|
|
126
|
+
} else if (code == 0) {
|
|
127
|
+
result[@"isCapturing"] = @(true);
|
|
128
|
+
state->transcribeHandler(state->jobId, @"transcribe", result);
|
|
129
|
+
} else {
|
|
130
|
+
result[@"isCapturing"] = @(true);
|
|
131
|
+
state->transcribeHandler(state->jobId, @"transcribe", result);
|
|
132
|
+
}
|
|
133
|
+
state->isTranscribing = false;
|
|
134
|
+
|
|
135
|
+
if (!state->isCapturing && state->nSamplesTranscribing != state->nSamples) {
|
|
136
|
+
state->isTranscribing = true;
|
|
137
|
+
// Finish transcribing the rest of the samples
|
|
138
|
+
[self fullTranscribeSamples:state];
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
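The result dictionary built here is what arrives as the payload of the JS events. A rough TypeScript shape inferred from the keys above (illustrative, not the package's published typings; note the early "end" paths in AudioInputCallback emit an empty payload):

```ts
interface RealtimeTranscribePayload {
  code: number;          // whisper_full return code, 0 = success
  processTime: number;   // transcription wall time in ms
  recordingTime: number; // audio covered by this pass in ms
  data?: {               // present when code === 0 (see getTextSegments below)
    result: string;
    segments: { text: string; t0: number; t1: number }[];
  };
  error?: string;              // present when code !== 0
  isCapturing: boolean;        // false on the final "end" event
  isStoppedByAction?: boolean; // set only on the "end" event
}
```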
```diff
+- (bool)isCapturing {
+    return self->recordState.isCapturing;
+}
+
+- (bool)isTranscribing {
+    return self->recordState.isTranscribing;
+}
+
+- (OSStatus)transcribeRealtime:(int)jobId
+    options:(NSDictionary *)options
+    onTranscribe:(void (^)(int, NSString *, NSDictionary *))onTranscribe
+{
+    self->recordState.transcribeHandler = onTranscribe;
+    self->recordState.jobId = jobId;
+    [self prepareRealtime:options];
+    self->recordState.nSamples = 0;
+
+    OSStatus status = AudioQueueNewInput(
+        &self->recordState.dataFormat,
+        AudioInputCallback,
+        &self->recordState,
+        NULL,
+        kCFRunLoopCommonModes,
+        0,
+        &self->recordState.queue
+    );
+
+    if (status == 0) {
+        for (int i = 0; i < NUM_BUFFERS; i++) {
+            AudioQueueAllocateBuffer(self->recordState.queue, NUM_BYTES_PER_BUFFER, &self->recordState.buffers[i]);
+            AudioQueueEnqueueBuffer(self->recordState.queue, self->recordState.buffers[i], 0, NULL);
+        }
+        status = AudioQueueStart(self->recordState.queue, NULL);
+        if (status == 0) {
+            self->recordState.isCapturing = true;
+        }
+    }
+    return status;
+}
+
+- (int)transcribeFile:(int)jobId
+    audioData:(float *)audioData
+    audioDataCount:(int)audioDataCount
+    options:(NSDictionary *)options
+{
+    self->recordState.isTranscribing = true;
+    self->recordState.jobId = jobId;
+    int code = [self fullTranscribe:jobId audioData:audioData audioDataCount:audioDataCount options:options];
+    self->recordState.jobId = -1;
+    self->recordState.isTranscribing = false;
+    return code;
+}
+
+- (void)stopAudio {
+    AudioQueueStop(self->recordState.queue, true);
+    for (int i = 0; i < NUM_BUFFERS; i++) {
+        AudioQueueFreeBuffer(self->recordState.queue, self->recordState.buffers[i]);
+    }
+    AudioQueueDispose(self->recordState.queue, true);
+}
+
+- (void)stopTranscribe:(int)jobId {
+    rn_whisper_abort_transcribe(jobId);
+    if (!self->recordState.isRealtime || !self->recordState.isCapturing) {
+        return;
+    }
+    self->recordState.isCapturing = false;
+    self->recordState.isStoppedByAction = true;
+    [self stopAudio];
+}
+
```
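stopTranscribe relies on rn_whisper_abort_transcribe, one of the helpers added to cpp/rn-whisper.cpp in this release (the +26-line diff of that file is not included in this excerpt). Judging by how fullTranscribe below wires its result into encoder_begin_callback, it plausibly amounts to a jobId-to-flag map; a conceptual model only, not the actual C++ implementation:

```ts
// Conceptual model of the rn-whisper abort map; the real implementation
// lives in cpp/rn-whisper.cpp, which this excerpt does not show.
const abortMap = new Map<number, { aborted: boolean }>();

function assignAbortFlag(jobId: number) { // ~ rn_whisper_assign_abort_map
  const flag = { aborted: false };
  abortMap.set(jobId, flag);
  return flag; // whisper's encoder_begin_callback polls this flag
}

function abortJob(jobId: number) { // ~ rn_whisper_abort_transcribe
  const flag = abortMap.get(jobId);
  if (flag) flag.aborted = true; // whisper_full stops before the next encode
}

function removeAbortFlag(jobId: number) { // ~ rn_whisper_remove_abort_map
  abortMap.delete(jobId);
}
```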
```diff
+- (void)stopCurrentTranscribe {
+    if (!self->recordState.jobId) {
+        return;
+    }
+    [self stopTranscribe:self->recordState.jobId];
+}
+
+- (int)fullTranscribe:(int)jobId audioData:(float *)audioData audioDataCount:(int)audioDataCount options:(NSDictionary *)options {
+    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+    const int max_threads = options[@"maxThreads"] != nil ?
+        [options[@"maxThreads"] intValue] :
+        MIN(4, (int)[[NSProcessInfo processInfo] processorCount]);
+
+    if (options[@"beamSize"] != nil) {
+        params.strategy = WHISPER_SAMPLING_BEAM_SEARCH;
+        params.beam_search.beam_size = [options[@"beamSize"] intValue];
+    }
+
+    params.print_realtime = false;
+    params.print_progress = false;
+    params.print_timestamps = false;
+    params.print_special = false;
+    params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false;
+    params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false;
+    params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto";
+    params.n_threads = max_threads;
+    params.offset_ms = 0;
+    params.no_context = true;
+    params.single_segment = self->recordState.isRealtime;
+
+    if (options[@"maxLen"] != nil) {
+        params.max_len = [options[@"maxLen"] intValue];
+    }
+    params.token_timestamps = options[@"tokenTimestamps"] != nil ? [options[@"tokenTimestamps"] boolValue] : false;
+
+    if (options[@"bestOf"] != nil) {
+        params.greedy.best_of = [options[@"bestOf"] intValue];
+    }
+    if (options[@"maxContext"] != nil) {
+        params.n_max_text_ctx = [options[@"maxContext"] intValue];
+    }
+
+    if (options[@"offset"] != nil) {
+        params.offset_ms = [options[@"offset"] intValue];
+    }
+    if (options[@"duration"] != nil) {
+        params.duration_ms = [options[@"duration"] intValue];
+    }
+    if (options[@"wordThold"] != nil) {
+        params.thold_pt = [options[@"wordThold"] intValue];
+    }
+    if (options[@"temperature"] != nil) {
+        params.temperature = [options[@"temperature"] floatValue];
+    }
+    if (options[@"temperatureInc"] != nil) {
+        params.temperature_inc = [options[@"temperature_inc"] floatValue];
+    }
+
+    if (options[@"prompt"] != nil) {
+        std::string *prompt = new std::string([options[@"prompt"] UTF8String]);
+        rn_whisper_convert_prompt(
+            self->ctx,
+            params,
+            prompt
+        );
+    }
+
+    params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+        bool is_aborted = *(bool*)user_data;
+        return !is_aborted;
+    };
+    params.encoder_begin_callback_user_data = rn_whisper_assign_abort_map(jobId);
+
+    whisper_reset_timings(self->ctx);
+
+    int code = whisper_full(self->ctx, params, audioData, audioDataCount);
+    rn_whisper_remove_abort_map(jobId);
+    // if (code == 0) {
+    //     whisper_print_timings(self->ctx);
+    // }
+    return code;
+}
+
+- (NSDictionary *)getTextSegments {
+    NSString *result = @"";
+    int n_segments = whisper_full_n_segments(self->ctx);
+
+    NSMutableArray *segments = [[NSMutableArray alloc] init];
+    for (int i = 0; i < n_segments; i++) {
+        const char * text_cur = whisper_full_get_segment_text(self->ctx, i);
+        result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+
+        const int64_t t0 = whisper_full_get_segment_t0(self->ctx, i);
+        const int64_t t1 = whisper_full_get_segment_t1(self->ctx, i);
+        NSDictionary *segment = @{
+            @"text": [NSString stringWithUTF8String:text_cur],
+            @"t0": [NSNumber numberWithLongLong:t0],
+            @"t1": [NSNumber numberWithLongLong:t1]
+        };
+        [segments addObject:segment];
+    }
+    return @{
+        @"result": result,
+        @"segments": segments
+    };
+}
+
+- (void)invalidate {
+    [self stopCurrentTranscribe];
+    whisper_free(self->ctx);
+}
+
+@end
```
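Putting the pieces together, a sketch of a one-shot file transcription against the raw native module and the shape getTextSegments resolves with. whisper.cpp reports segment timestamps t0/t1 in 10 ms units, so multiplying by 10 gives milliseconds; the option names are the ones read in fullTranscribe above:

```ts
import { NativeModules } from 'react-native';

const { RNWhisper } = NativeModules; // see the event sketch earlier

async function transcribeWav(contextId: number, jobId: number, path: string) {
  const data = await RNWhisper.transcribeFile(contextId, jobId, path, {
    language: 'en',        // params.language; defaults to "auto"
    maxLen: 1,             // params.max_len
    tokenTimestamps: true, // params.token_timestamps
  });
  // data: { result: string, segments: { text, t0, t1 }[] }
  for (const s of data.segments) {
    console.log(`[${s.t0 * 10}ms - ${s.t1 * 10}ms] ${s.text}`);
  }
  return data.result;
}
```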