whisper.rn 0.4.0-rc.1 → 0.4.0-rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/android/build.gradle +4 -0
- package/android/src/main/CMakeLists.txt +21 -1
- package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
- package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
- package/android/src/main/jni-utils.h +76 -0
- package/android/src/main/jni.cpp +226 -109
- package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
- package/cpp/coreml/whisper-encoder-impl.h +1 -1
- package/cpp/coreml/whisper-encoder.h +4 -0
- package/cpp/coreml/whisper-encoder.mm +5 -3
- package/cpp/ggml-alloc.c +797 -400
- package/cpp/ggml-alloc.h +60 -10
- package/cpp/ggml-backend-impl.h +255 -0
- package/cpp/ggml-backend-reg.cpp +582 -0
- package/cpp/ggml-backend.cpp +2002 -0
- package/cpp/ggml-backend.h +354 -0
- package/cpp/ggml-common.h +1851 -0
- package/cpp/ggml-cpp.h +39 -0
- package/cpp/ggml-cpu-aarch64.cpp +4247 -0
- package/cpp/ggml-cpu-aarch64.h +8 -0
- package/cpp/ggml-cpu-impl.h +531 -0
- package/cpp/ggml-cpu-quants.c +12245 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14792 -0
- package/cpp/ggml-cpu.cpp +653 -0
- package/cpp/ggml-cpu.h +137 -0
- package/cpp/ggml-impl.h +567 -0
- package/cpp/ggml-metal-impl.h +288 -0
- package/cpp/ggml-metal.h +24 -43
- package/cpp/ggml-metal.m +4867 -1080
- package/cpp/ggml-opt.cpp +854 -0
- package/cpp/ggml-opt.h +216 -0
- package/cpp/ggml-quants.c +5238 -0
- package/cpp/ggml-quants.h +100 -0
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +14 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +5106 -19431
- package/cpp/ggml.h +847 -669
- package/cpp/gguf.cpp +1329 -0
- package/cpp/gguf.h +202 -0
- package/cpp/rn-audioutils.cpp +68 -0
- package/cpp/rn-audioutils.h +14 -0
- package/cpp/rn-whisper-log.h +11 -0
- package/cpp/rn-whisper.cpp +221 -52
- package/cpp/rn-whisper.h +50 -15
- package/cpp/whisper.cpp +3174 -1533
- package/cpp/whisper.h +176 -44
- package/ios/RNWhisper.mm +139 -46
- package/ios/RNWhisperAudioUtils.h +1 -2
- package/ios/RNWhisperAudioUtils.m +18 -67
- package/ios/RNWhisperContext.h +11 -8
- package/ios/RNWhisperContext.mm +195 -150
- package/jest/mock.js +15 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +76 -28
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +76 -28
- package/lib/module/index.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +13 -4
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +37 -5
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +9 -7
- package/src/NativeRNWhisper.ts +20 -4
- package/src/index.ts +98 -42
- package/src/version.json +1 -1
- package/whisper-rn.podspec +13 -20
- package/cpp/README.md +0 -4
- package/cpp/ggml-metal.metal +0 -2353
package/cpp/whisper.h
CHANGED
@@ -1,10 +1,21 @@
 #ifndef WHISPER_H
 #define WHISPER_H
 
+#include "ggml.h"
+#include "ggml-cpu.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
 
+#ifdef __GNUC__
+#    define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define WHISPER_DEPRECATED(func, hint) func
+#endif
+
 #ifdef WHISPER_SHARED
 #    ifdef _WIN32
 #        ifdef WHISPER_BUILD
@@ -21,7 +32,6 @@
 
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT 400
-#define WHISPER_N_MEL 80
 #define WHISPER_HOP_LENGTH 160
 #define WHISPER_CHUNK_SIZE 30
 
@@ -41,7 +51,9 @@ extern "C" {
    //
    // ...
    //
-   //
+   // whisper_context_params cparams = whisper_context_default_params();
+   //
+   // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    //
    // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    //     fprintf(stderr, "failed to process audio\n");
@@ -69,7 +81,53 @@ extern "C" {
    struct whisper_state;
    struct whisper_full_params;
 
-   typedef
+   typedef int32_t whisper_pos;
+   typedef int32_t whisper_token;
+   typedef int32_t whisper_seq_id;
+
+   enum whisper_alignment_heads_preset {
+       WHISPER_AHEADS_NONE,
+       WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
+       WHISPER_AHEADS_CUSTOM,
+       WHISPER_AHEADS_TINY_EN,
+       WHISPER_AHEADS_TINY,
+       WHISPER_AHEADS_BASE_EN,
+       WHISPER_AHEADS_BASE,
+       WHISPER_AHEADS_SMALL_EN,
+       WHISPER_AHEADS_SMALL,
+       WHISPER_AHEADS_MEDIUM_EN,
+       WHISPER_AHEADS_MEDIUM,
+       WHISPER_AHEADS_LARGE_V1,
+       WHISPER_AHEADS_LARGE_V2,
+       WHISPER_AHEADS_LARGE_V3,
+       WHISPER_AHEADS_LARGE_V3_TURBO,
+   };
+
+   typedef struct whisper_ahead {
+       int n_text_layer;
+       int n_head;
+   } whisper_ahead;
+
+   typedef struct whisper_aheads {
+       size_t n_heads;
+       const whisper_ahead * heads;
+   } whisper_aheads;
+
+   struct whisper_context_params {
+       bool use_gpu;
+       bool use_coreml;
+       bool flash_attn;
+       int gpu_device; // CUDA device
+
+       // [EXPERIMENTAL] Token-level timestamps with DTW
+       bool dtw_token_timestamps;
+       enum whisper_alignment_heads_preset dtw_aheads_preset;
+
+       int dtw_n_top;
+       struct whisper_aheads dtw_aheads;
+
+       size_t dtw_mem_size; // TODO: remove
+   };
 
    typedef struct whisper_token_data {
        whisper_token id; // token id
@@ -85,6 +143,11 @@ extern "C" {
        int64_t t0; // start time of the token
        int64_t t1; // end time of the token
 
+       // [EXPERIMENTAL] Token-level timestamps with DTW
+       // do not use if you haven't computed token-level timestamps with dtw
+       // Roughly corresponds to the moment in audio in which the token was output
+       int64_t t_dtw;
+
        float vlen; // voice length of the token
    } whisper_token_data;
 
@@ -96,21 +159,74 @@ extern "C" {
        void (*close)(void * ctx);
    } whisper_model_loader;
 
+   // grammar element type
+   enum whisper_gretype {
+       // end of rule definition
+       WHISPER_GRETYPE_END = 0,
+
+       // start of alternate definition for rule
+       WHISPER_GRETYPE_ALT = 1,
+
+       // non-terminal element: reference to rule
+       WHISPER_GRETYPE_RULE_REF = 2,
+
+       // terminal element: character (code point)
+       WHISPER_GRETYPE_CHAR = 3,
+
+       // inverse char(s) ([^a], [^a-b] [^abc])
+       WHISPER_GRETYPE_CHAR_NOT = 4,
+
+       // modifies a preceding WHISPER_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+       // be an inclusive range ([a-z])
+       WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
+
+       // modifies a preceding WHISPER_GRETYPE_CHAR or
+       // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+       WHISPER_GRETYPE_CHAR_ALT = 6,
+   };
+
+   typedef struct whisper_grammar_element {
+       enum whisper_gretype type;
+       uint32_t value; // Unicode code point or rule ID
+   } whisper_grammar_element;
+
    // Various functions for loading a ggml whisper model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
-
-   WHISPER_API struct whisper_context *
-
-   WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
-   WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
-   WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
+   WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model, struct whisper_context_params params);
+   WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
+   WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
 
    // These are the same as the above, but the internal state of the context is not allocated automatically
    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-   WHISPER_API struct whisper_context *
-   WHISPER_API struct whisper_context *
-   WHISPER_API struct whisper_context *
+   WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model, struct whisper_context_params params);
+   WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
+   WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
+
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
+       "use whisper_init_from_file_with_params instead"
+   );
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
+       "use whisper_init_from_buffer_with_params instead"
+   );
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
+       "use whisper_init_with_params instead"
+   );
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
+       "use whisper_init_from_file_with_params_no_state instead"
+   );
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
+       "use whisper_init_from_buffer_with_params_no_state instead"
+   );
+   WHISPER_DEPRECATED(
+       WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
+       "use whisper_init_with_params_no_state instead"
+   );
 
    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
 
@@ -125,6 +241,13 @@ extern "C" {
    // GPU, by caching compiled 'blobs' there.
    // Set to nullptr if not used.
    // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
+   WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
+       struct whisper_context * ctx,
+       struct whisper_state * state,
+       const char * model_path,
+       const char * device,
+       const char * cache_dir);
+
    WHISPER_API int whisper_ctx_init_openvino_encoder(
        struct whisper_context * ctx,
        const char * model_path,
@@ -135,6 +258,7 @@ extern "C" {
    WHISPER_API void whisper_free (struct whisper_context * ctx);
    WHISPER_API void whisper_free_state(struct whisper_state * state);
    WHISPER_API void whisper_free_params(struct whisper_full_params * params);
+   WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
 
    // Convert RAW PCM audio to log mel spectrogram.
    // The resulting spectrogram is stored inside the default state of the provided whisper context.
@@ -152,22 +276,6 @@ extern "C" {
        int n_samples,
        int n_threads);
 
-   // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-   // The resulting spectrogram is stored inside the default state of the provided whisper context.
-   // Returns 0 on success
-   WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-       struct whisper_context * ctx,
-       const float * samples,
-       int n_samples,
-       int n_threads);
-
-   WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-       struct whisper_context * ctx,
-       struct whisper_state * state,
-       const float * samples,
-       int n_samples,
-       int n_threads);
-
    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
@@ -224,7 +332,7 @@ extern "C" {
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
-   // Returns
+   // Returns a negative number on failure - the number of tokens that would have been returned
    // TODO: not sure if correct
    WHISPER_API int whisper_tokenize(
        struct whisper_context * ctx,
@@ -232,8 +340,12 @@ extern "C" {
        whisper_token * tokens,
        int n_max_tokens);
 
+   // Return the number of tokens in the provided text
+   // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
+   int whisper_token_count(struct whisper_context * ctx, const char * text);
+
    // Largest language id (i.e. number of available languages - 1)
-   WHISPER_API int whisper_lang_max_id();
+   WHISPER_API int whisper_lang_max_id(void);
 
    // Return the id of the specified language, returns -1 if not found
    // Examples:
@@ -244,6 +356,9 @@ extern "C" {
    // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
    WHISPER_API const char * whisper_lang_str(int id);
 
+   // Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
+   WHISPER_API const char * whisper_lang_str_full(int id);
+
    // Use mel data at offset_ms to try and auto-detect the spoken language
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
    // Returns the top language id or negative on failure
@@ -310,6 +425,14 @@ extern "C" {
    WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
 
    // Performance information from the default state.
+   struct whisper_timings {
+       float sample_ms;
+       float encode_ms;
+       float decode_ms;
+       float batchd_ms;
+       float prompt_ms;
+   };
+   WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
 
@@ -337,11 +460,6 @@ extern "C" {
    // If it returns false, the computation is aborted
    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
 
-   // Abort callback
-   // If not NULL, called before ggml computation
-   // If it returns true, the computation is aborted
-   typedef bool (*whisper_abort_callback)(void * user_data);
-
    // Logits filter callback
    // Can be used to modify the logits before sampling
    // If not NULL, called after applying temperature to logits
@@ -366,6 +484,7 @@ extern "C" {
 
        bool translate;
        bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
+       bool no_timestamps; // do not generate timestamps
        bool single_segment; // force single segment output (useful for streaming)
        bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
        bool print_progress; // print progress information
@@ -382,15 +501,19 @@ extern "C" {
 
        // [EXPERIMENTAL] speed-up techniques
        // note: these can significantly reduce the quality of the output
-       bool speed_up; // speed-up the audio by 2x using Phase Vocoder
        bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
        int audio_ctx; // overwrite the audio context size (0 = use default)
 
        // [EXPERIMENTAL] [TDRZ] tinydiarize
        bool tdrz_enable; // enable tinydiarize speaker turn detection
 
+       // A regular expression that matches tokens to suppress
+       const char * suppress_regex;
+
        // tokens to provide to the whisper decoder as initial prompt
        // these are prepended to any existing text context from a previous call
+       // use whisper_tokenize() to convert text to tokens
+       // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
        const char * initial_prompt;
        const whisper_token * prompt_tokens;
        int prompt_n_tokens;
@@ -400,8 +523,8 @@ extern "C" {
        bool detect_language;
 
        // common decoding parameters:
-       bool suppress_blank;
-       bool
+       bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
+       bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
 
        float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
        float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
@@ -412,7 +535,7 @@ extern "C" {
        float temperature_inc;
        float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
        float logprob_thold;
-       float no_speech_thold;
+       float no_speech_thold;
 
        struct {
            int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
@@ -437,17 +560,24 @@ extern "C" {
        void * encoder_begin_callback_user_data;
 
        // called each time before ggml computation starts
-
+       wsp_ggml_abort_callback abort_callback;
        void * abort_callback_user_data;
 
        // called by each decoder to filter obtained logits
        whisper_logits_filter_callback logits_filter_callback;
        void * logits_filter_callback_user_data;
+
+       const whisper_grammar_element ** grammar_rules;
+       size_t n_grammar_rules;
+       size_t i_start_rule;
+       float grammar_penalty;
    };
 
-   // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
+   // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
+   WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
+   WHISPER_API struct whisper_context_params whisper_context_default_params (void);
    WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
-   WHISPER_API struct whisper_full_params
+   WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy);
 
    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
    // Not thread safe for same context
@@ -534,9 +664,11 @@ extern "C" {
 
    // Control logging output; default behavior is to print to stderr
 
-
-   WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
+   WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
 
+   // Get the no_speech probability for the specified segment
+   WHISPER_API float whisper_full_get_segment_no_speech_prob (struct whisper_context * ctx, int i_segment);
+   WHISPER_API float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment);
 #ifdef __cplusplus
 }
 #endif
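The whisper.h changes above move context creation to a params struct: the old whisper_init_* entry points survive only as WHISPER_DEPRECATED wrappers, and callers now pass a whisper_context_params (use_gpu, use_coreml, flash_attn, DTW token timestamps, ...) to the new *_with_params variants. A minimal sketch of the new flow, not taken from the package itself (the model path is a placeholder):

#include "whisper.h"

int load_model_example(void) {
    // Start from the library defaults, then opt in to fields added in this release.
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu              = true;   // Metal/CUDA if the build supports it
    cparams.flash_attn           = false;
    cparams.dtw_token_timestamps = false;  // [EXPERIMENTAL] DTW token timestamps

    // Replaces the deprecated whisper_init_from_file().
    struct whisper_context * ctx =
        whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    if (ctx == NULL) return 1; // model failed to load

    // ... run whisper_full(ctx, wparams, samples, n_samples) as before ...

    whisper_free(ctx);
    return 0;
}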
package/ios/RNWhisper.mm
CHANGED
@@ -48,7 +48,9 @@ RCT_REMAP_METHOD(initContext,
 
     NSString *modelPath = [modelOptions objectForKey:@"filePath"];
     BOOL isBundleAsset = [[modelOptions objectForKey:@"isBundleAsset"] boolValue];
+    BOOL useGpu = [[modelOptions objectForKey:@"useGpu"] boolValue];
     BOOL useCoreMLIos = [[modelOptions objectForKey:@"useCoreMLIos"] boolValue];
+    BOOL useFlashAttn = [[modelOptions objectForKey:@"useFlashAttn"] boolValue];
 
     // For support debug assets in development mode
     BOOL downloadCoreMLAssets = [[modelOptions objectForKey:@"downloadCoreMLAssets"] boolValue];
@@ -77,6 +79,8 @@ RCT_REMAP_METHOD(initContext,
         initWithModelPath:path
         contextId:contextId
         noCoreML:!useCoreMLIos
+        noMetal:!useGpu
+        useFlashAttn:useFlashAttn
     ];
     if ([context getContext] == NULL) {
         reject(@"whisper_cpp_error", @"Failed to load the model", nil);
@@ -85,7 +89,11 @@ RCT_REMAP_METHOD(initContext,
 
     [contexts setObject:context forKey:[NSNumber numberWithInt:contextId]];
 
-    resolve(
+    resolve(@{
+        @"contextId": @(contextId),
+        @"gpu": @([context isMetalEnabled]),
+        @"reasonNoGPU": [context reasonNoMetal],
+    });
 }
 
 - (NSArray *)supportedEvents {
@@ -97,48 +105,23 @@ RCT_REMAP_METHOD(initContext,
     ];
 }
 
-
-
-
-
-
-
-
+- (void)transcribeData:(RNWhisperContext *)context
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withData:(float *)data
+    withDataCount:(int)count
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject
 {
-
-
-    if (context == nil) {
-        reject(@"whisper_error", @"Context not found", nil);
-        return;
-    }
-    if ([context isCapturing]) {
-        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
-        return;
-    }
-    if ([context isTranscribing]) {
-        reject(@"whisper_error", @"Context is already transcribing", nil);
-        return;
-    }
-
-    NSString *path = waveFilePath;
-    if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
-        path = [RNWhisperDownloader downloadFile:path toFile:nil];
-    }
-
-    int count = 0;
-    float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
-    if (waveFile == nil) {
-        reject(@"whisper_error", @"Invalid file", nil);
-        return;
-    }
-    [context transcribeFile:jobId
-        audioData:waveFile
+    [context transcribeData:jobId
+        audioData:data
         audioDataCount:count
         options:options
         onProgress: ^(int progress) {
-
-
-
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
+
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
                     body:@{
@@ -150,9 +133,9 @@ RCT_REMAP_METHOD(transcribeFile,
             });
         }
         onNewSegments: ^(NSDictionary *result) {
-
-
-
+            rnwhisper::job* job = rnwhisper::job_get(jobId);
+            if (job && job->is_aborted()) return;
+
             dispatch_async(dispatch_get_main_queue(), ^{
                 [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
                     body:@{
@@ -164,12 +147,10 @@ RCT_REMAP_METHOD(transcribeFile,
             });
         }
         onEnd: ^(int code) {
-            if (code != 0) {
-                free(waveFile);
+            if (code != 0 && code != 999) {
                 reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
                 return;
             }
-            free(waveFile);
             NSMutableDictionary *result = [context getTextSegments];
             result[@"isAborted"] = @([context isStoppedByAction]);
             resolve(result);
@@ -177,6 +158,99 @@ RCT_REMAP_METHOD(transcribeFile,
     ];
 }
 
+RCT_REMAP_METHOD(transcribeFile,
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withWaveFile:(NSString *)waveFilePathOrDataBase64
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"Context is already transcribing", nil);
+        return;
+    }
+
+    float *data = nil;
+    int count = 0;
+    if ([waveFilePathOrDataBase64 hasPrefix:@"http://"] || [waveFilePathOrDataBase64 hasPrefix:@"https://"]) {
+        NSString *path = [RNWhisperDownloader downloadFile:waveFilePathOrDataBase64 toFile:nil];
+        data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
+    } else if ([waveFilePathOrDataBase64 hasPrefix:@"data:audio/wav;base64,"]) {
+        NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[waveFilePathOrDataBase64 substringFromIndex:22] options:0];
+        data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
+    } else {
+        data = [RNWhisperAudioUtils decodeWaveFile:waveFilePathOrDataBase64 count:&count];
+    }
+    if (data == nil) {
+        reject(@"whisper_error", @"Invalid file", nil);
+        return;
+    }
+
+    [self transcribeData:context
+        withContextId:contextId
+        withJobId:jobId
+        withData:data
+        withDataCount:count
+        withOptions:options
+        withResolver:resolve
+        withRejecter:reject
+    ];
+}
+
+RCT_REMAP_METHOD(transcribeData,
+    withContextId:(int)contextId
+    withJobId:(int)jobId
+    withData:(NSString *)dataBase64 // pcm data base64 encoded
+    withOptions:(NSDictionary *)options
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isCapturing]) {
+        reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"Context is already transcribing", nil);
+        return;
+    }
+
+    NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:dataBase64 options:0];
+    int count = 0;
+    float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
+
+    if (data == nil) {
+        reject(@"whisper_error", @"Invalid data", nil);
+        return;
+    }
+
+    [self transcribeData:context
+        withContextId:contextId
+        withJobId:jobId
+        withData:data
+        withDataCount:count
+        withOptions:options
+        withResolver:resolve
+        withRejecter:reject
+    ];
+}
+
 RCT_REMAP_METHOD(startRealtimeTranscribe,
     withContextId:(int)contextId
     withJobId:(int)jobId
@@ -238,6 +312,25 @@ RCT_REMAP_METHOD(abortTranscribe,
     resolve(nil);
 }
 
+RCT_REMAP_METHOD(bench,
+    withContextId:(int)contextId
+    withMaxThreads:(int)maxThreads
+    withResolver:(RCTPromiseResolveBlock)resolve
+    withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
+    if (context == nil) {
+        reject(@"whisper_error", @"Context not found", nil);
+        return;
+    }
+    if ([context isTranscribing]) {
+        reject(@"whisper_error", @"The context is transcribing", nil);
+        return;
+    }
+    NSString *result = [context bench:maxThreads];
+    resolve(result);
+}
+
 RCT_REMAP_METHOD(releaseContext,
     withContextId:(int)contextId
     withResolver:(RCTPromiseResolveBlock)resolve
@@ -273,7 +366,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
         [context invalidate];
     }
 
-
+    rnwhisper::job_abort_all(); // graceful abort
 
     [contexts removeAllObjects];
     contexts = nil;
package/ios/RNWhisperAudioUtils.h
CHANGED
@@ -2,8 +2,7 @@
 
 @interface RNWhisperAudioUtils : NSObject
 
-+ (
-+ (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
++ (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader;
 + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
 
 @end
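The decodeWaveData:count:cutHeader: helper declared above is what lets the native transcribeFile accept a data:audio/wav;base64, URI and transcribeData accept raw base64-encoded PCM: the base64 payload is decoded to bytes and converted to the float samples whisper.cpp expects, with cutHeader controlling whether a WAV header is skipped first. A rough C sketch of that conversion, assuming 16-bit little-endian PCM and a canonical 44-byte RIFF header (both are illustrative assumptions, not code from the package):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

// Hypothetical stand-in for decodeWaveData: converts 16-bit PCM bytes to
// float samples in [-1, 1). Returns a malloc'd buffer the caller must free,
// or NULL on failure.
static float * decode_pcm16(const uint8_t * bytes, size_t n_bytes,
                            int cut_header, int * out_count) {
    const size_t header = cut_header ? 44 : 0; // assumed canonical WAV header size
    if (bytes == NULL || n_bytes <= header) return NULL;

    const size_t n = (n_bytes - header) / sizeof(int16_t);
    float * samples = malloc(n * sizeof(float));
    if (samples == NULL) return NULL;

    for (size_t i = 0; i < n; i++) {
        int16_t s;
        memcpy(&s, bytes + header + i * sizeof(int16_t), sizeof(int16_t));
        samples[i] = s / 32768.0f; // scale int16 range to float
    }
    *out_count = (int)n;
    return samples;
}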