whisper.rn 0.1.0 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/whisper.h ADDED
@@ -0,0 +1,393 @@
+ #ifndef WHISPER_H
+ #define WHISPER_H
+
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdbool.h>
+
+ #ifdef WHISPER_SHARED
+ #    ifdef _WIN32
+ #        ifdef WHISPER_BUILD
+ #            define WHISPER_API __declspec(dllexport)
+ #        else
+ #            define WHISPER_API __declspec(dllimport)
+ #        endif
+ #    else
+ #        define WHISPER_API __attribute__ ((visibility ("default")))
+ #    endif
+ #else
+ #    define WHISPER_API
+ #endif
+
+ #define WHISPER_SAMPLE_RATE 16000
+ #define WHISPER_N_FFT       400
+ #define WHISPER_N_MEL       80
+ #define WHISPER_HOP_LENGTH  160
+ #define WHISPER_CHUNK_SIZE  30
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ //
+ // C interface
+ //
+ // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
+ // concurrently.
+ //
+ // Basic usage:
+ //
+ // #include "whisper.h"
+ //
+ // ...
+ //
+ // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+ //
+ // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+ //     fprintf(stderr, "failed to process audio\n");
+ //     return 7;
+ // }
+ //
+ // const int n_segments = whisper_full_n_segments(ctx);
+ // for (int i = 0; i < n_segments; ++i) {
+ //     const char * text = whisper_full_get_segment_text(ctx, i);
+ //     printf("%s", text);
+ // }
+ //
+ // whisper_free(ctx);
+ //
+ // ...
+ //
+ // This is a demonstration of the most straightforward usage of the library.
+ // "pcmf32" contains the RAW audio data in 32-bit floating point format.
+ //
+ // The interface also allows for more fine-grained control over the computation, but it requires a deeper
+ // understanding of how the model works.
+ //
+
+ struct whisper_context;
+
+ typedef int whisper_token;
+
+ typedef struct whisper_token_data {
+     whisper_token id;  // token id
+     whisper_token tid; // forced timestamp token id
+
+     float p;     // probability of the token
+     float plog;  // log probability of the token
+     float pt;    // probability of the timestamp token
+     float ptsum; // sum of probabilities of all timestamp tokens
+
+     // token-level timestamp data
+     // do not use if you haven't computed token-level timestamps
+     int64_t t0; // start time of the token
+     int64_t t1; // end time of the token
+
+     float vlen; // voice length of the token
+ } whisper_token_data;
+
+ typedef struct whisper_model_loader {
+     void * context;
+
+     size_t (*read)(void * ctx, void * output, size_t read_size);
+     bool (*eof)(void * ctx);
+     void (*close)(void * ctx);
+ } whisper_model_loader;
+
+ // Various functions for loading a ggml whisper model.
+ // Allocate (almost) all memory needed for the model.
+ // Return NULL on failure
+ WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
+ WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
+ WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
+
+ // Frees all memory allocated by the model.
+ WHISPER_API void whisper_free(struct whisper_context * ctx);
+
+ // Convert RAW PCM audio to log mel spectrogram.
+ // The resulting spectrogram is stored inside the provided whisper context.
+ // Returns 0 on success
+ WHISPER_API int whisper_pcm_to_mel(
+         struct whisper_context * ctx,
+         const float * samples,
+         int n_samples,
+         int n_threads);
+
+ // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
+ // The resulting spectrogram is stored inside the provided whisper context.
+ // Returns 0 on success
+ WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
+         struct whisper_context* ctx,
+         const float* samples,
+         int n_samples,
+         int n_threads);
+
+
+ // This can be used to set a custom log mel spectrogram inside the provided whisper context.
+ // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
+ // n_mel must be 80
+ // Returns 0 on success
+ WHISPER_API int whisper_set_mel(
+         struct whisper_context * ctx,
+         const float * data,
+         int n_len,
+         int n_mel);
+
+ // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
+ // offset can be used to specify the offset of the first frame in the spectrogram.
+ // Returns 0 on success
+ WHISPER_API int whisper_encode(
+         struct whisper_context * ctx,
+         int offset,
+         int n_threads);
+
+ // Run the Whisper decoder to obtain the logits and probabilities for the next token.
+ // Make sure to call whisper_encode() first.
+ // tokens + n_tokens is the provided context for the decoder.
+ // n_past is the number of tokens to use from previous decoder calls.
+ // Returns 0 on success
+ // TODO: add support for multiple decoders
+ WHISPER_API int whisper_decode(
+         struct whisper_context * ctx,
+         const whisper_token * tokens,
+         int n_tokens,
+         int n_past,
+         int n_threads);
+
+ // Convert the provided text into tokens.
+ // The tokens pointer must be large enough to hold the resulting tokens.
+ // Returns the number of tokens on success, no more than n_max_tokens
+ // Returns -1 on failure
+ // TODO: not sure if correct
+ WHISPER_API int whisper_tokenize(
+         struct whisper_context * ctx,
+         const char * text,
+         whisper_token * tokens,
+         int n_max_tokens);
+
+ // Largest language id (i.e. number of available languages - 1)
+ WHISPER_API int whisper_lang_max_id();
+
+ // Return the id of the specified language, returns -1 if not found
+ // Examples:
+ //   "de" -> 2
+ //   "german" -> 2
+ WHISPER_API int whisper_lang_id(const char * lang);
+
+ // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
+ WHISPER_API const char * whisper_lang_str(int id);
+
+ // Use mel data at offset_ms to try and auto-detect the spoken language
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
+ // Returns the top language id or negative on failure
+ // If not null, fills the lang_probs array with the probabilities of all languages
+ // The array must be whisper_lang_max_id() + 1 in size
+ // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
+ WHISPER_API int whisper_lang_auto_detect(
+         struct whisper_context * ctx,
+         int offset_ms,
+         int n_threads,
+         float * lang_probs);
+
+ WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
+ WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
+ WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
+ WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
+ WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
+
+ // Token logits obtained from the last call to whisper_decode()
+ // The logits for the last token are stored in the last row
+ // Rows: n_tokens
+ // Cols: n_vocab
+ WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
+
+ // Token Id -> String. Uses the vocabulary in the provided context
+ WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
+
+ // Special tokens
+ WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
+ WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
+
+ // Task tokens
+ WHISPER_API whisper_token whisper_token_translate (void);
+ WHISPER_API whisper_token whisper_token_transcribe(void);
+
+ // Performance information
+ WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
+ WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
+
+ // Print system information
+ WHISPER_API const char * whisper_print_system_info(void);
+
+ ////////////////////////////////////////////////////////////////////////////
+
+ // Available sampling strategies
+ enum whisper_sampling_strategy {
+     WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
+     WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
+ };
+
+ // Text segment callback
+ // Called on every newly generated text segment
+ // Use the whisper_full_...() functions to obtain the text segments
+ typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
+
+ // Encoder begin callback
+ // If not NULL, called before the encoder starts
+ // If it returns false, the computation is aborted
+ typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+
+ // Logits filter callback
+ // Can be used to modify the logits before sampling
+ // If not NULL, called after applying temperature to logits
+ typedef void (*whisper_logits_filter_callback)(
+         struct whisper_context * ctx,
+         const whisper_token_data * tokens,
+         int n_tokens,
+         float * logits,
+         void * user_data);
+
+ // Parameters for the whisper_full() function
+ // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
+ // whisper_full_default_params()
+ struct whisper_full_params {
+     enum whisper_sampling_strategy strategy;
+
+     int n_threads;
+     int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
+     int offset_ms;      // start offset in ms
+     int duration_ms;    // audio duration to process in ms
+
+     bool translate;
+     bool no_context;       // do not use past transcription (if any) as initial prompt for the decoder
+     bool single_segment;   // force single segment output (useful for streaming)
+     bool print_special;    // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
+     bool print_progress;   // print progress information
+     bool print_realtime;   // print results from within whisper.cpp (avoid it, use callback instead)
+     bool print_timestamps; // print timestamps for each text segment when printing realtime
+
+     // [EXPERIMENTAL] token-level timestamps
+     bool token_timestamps; // enable token-level timestamps
+     float thold_pt;        // timestamp token probability threshold (~0.01)
+     float thold_ptsum;     // timestamp token sum probability threshold (~0.01)
+     int max_len;           // max segment length in characters
+     bool split_on_word;    // split on word rather than on token (when used with max_len)
+     int max_tokens;        // max tokens per segment (0 = no limit)
+
+     // [EXPERIMENTAL] speed-up techniques
+     // note: these can significantly reduce the quality of the output
+     bool speed_up; // speed-up the audio by 2x using Phase Vocoder
+     int audio_ctx; // overwrite the audio context size (0 = use default)
+
+     // tokens to provide to the whisper decoder as initial prompt
+     // these are prepended to any existing text context from a previous call
+     const whisper_token * prompt_tokens;
+     int prompt_n_tokens;
+
+     // for auto-detection, set to nullptr, "" or "auto"
+     const char * language;
+
+     // common decoding parameters:
+     bool suppress_blank;             // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
+     bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
+
+     float temperature;    // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
+     float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
+     float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
+
+     // fallback parameters
+     // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
+     float temperature_inc;
+     float entropy_thold;   // similar to OpenAI's "compression_ratio_threshold"
+     float logprob_thold;
+     float no_speech_thold; // TODO: not implemented
+
+     struct {
+         int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
+     } greedy;
+
+     struct {
+         int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
+
+         float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
+     } beam_search;
+
+     // called for every newly generated text segment
+     whisper_new_segment_callback new_segment_callback;
+     void * new_segment_callback_user_data;
+
+     // called each time before the encoder starts
+     whisper_encoder_begin_callback encoder_begin_callback;
+     void * encoder_begin_callback_user_data;
+
+     // called by each decoder to filter obtained logits
+     whisper_logits_filter_callback logits_filter_callback;
+     void * logits_filter_callback_user_data;
+ };
+
+ WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
+
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+ // Uses the specified decoding strategy to obtain the text.
+ WHISPER_API int whisper_full(
+         struct whisper_context * ctx,
+         struct whisper_full_params params,
+         const float * samples,
+         int n_samples);
+
+ // Split the input audio in chunks and process each chunk separately using whisper_full()
+ // It seems this approach can offer some speedup in some cases.
+ // However, the transcription accuracy can be worse at the beginning and end of each chunk.
+ WHISPER_API int whisper_full_parallel(
+         struct whisper_context * ctx,
+         struct whisper_full_params params,
+         const float * samples,
+         int n_samples,
+         int n_processors);
+
+ // Number of generated text segments.
+ // A segment can be a few words, a sentence, or even a paragraph.
+ WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
+
+ // Language id associated with the current context
+ WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
+
+ // Get the start and end time of the specified segment.
+ WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
+ WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
+
+ // Get the text of the specified segment.
+ WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
+
+ // Get number of tokens in the specified segment.
+ WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
+
+ // Get the token text of the specified token in the specified segment.
+ WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
+ WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
+
+ // Get token data for the specified token in the specified segment.
+ // This contains probabilities, timestamps, etc.
+ WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
+
+ // Get the probability of the specified token in the specified segment.
+ WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
+
+ ////////////////////////////////////////////////////////////////////////////
+
+ // Temporary helpers needed for exposing ggml interface
+
+ WHISPER_API int whisper_bench_memcpy(int n_threads);
+ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif
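
To make the header's own usage comment concrete, the following minimal C sketch is built only from the declarations above. It is not code shipped in this package: the model path is a placeholder, the caller is assumed to already hold 16 kHz mono float PCM (decoding a WAV file is out of scope here), and treating the segment timestamps as 10 ms ticks is an assumption not stated in this header.

#include <stdio.h>
#include "whisper.h"

// Minimal sketch: transcribe a buffer of 16 kHz mono float samples with the C API above.
// pcmf32 and n_samples are assumed to be provided by the caller.
static int transcribe(const float * pcmf32, int n_samples) {
    struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin"); // placeholder path
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Start from the defaults for the chosen sampling strategy, then adjust fields as needed.
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.n_threads      = 4;
    wparams.print_realtime = false;
    wparams.language       = "en";

    if (whisper_full(ctx, wparams, pcmf32, n_samples) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(ctx);
        return 2;
    }

    // Walk the generated segments; t0/t1 are assumed to be 10 ms ticks (see note above).
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
        printf("[%lld -> %lld] %s\n",
               (long long) t0, (long long) t1,
               whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}

The same pattern with whisper_full_parallel(ctx, wparams, pcmf32, n_samples, n_processors) splits the audio across processors, at some accuracy cost near chunk boundaries, as the header itself notes.
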
package/jest/mock.js CHANGED
@@ -1,4 +1,4 @@
- import { NativeModules } from 'react-native'
+ const { NativeModules } = require('react-native')
 
  if (!NativeModules.RNWhisper) {
    NativeModules.RNWhisper = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "whisper.rn",
- "version": "0.1.0",
+ "version": "0.1.3",
  "description": "React Native binding of whisper.cpp",
  "main": "lib/commonjs/index",
  "module": "lib/module/index",
@@ -13,7 +13,7 @@
  "jest",
  "android",
  "ios",
- "cpp",
+ "cpp/*.*",
  "*.podspec",
  "!lib/typescript/example",
  "!ios/build",
@@ -40,7 +40,9 @@
  "react-native",
  "ios",
  "android",
+ "native module",
  "whisper",
+ "openai",
  "whisper.cpp",
  "speech recognition"
  ],
@@ -64,6 +66,7 @@
  "commitlint": "^17.0.2",
  "del-cli": "^5.0.0",
  "eslint": "^8.36.0",
+ "jest": "^29.5.0",
  "metro-react-native-babel-preset": "0.73.8",
  "pod-install": "^0.1.38",
  "react": "18.2.0",
@@ -125,8 +128,5 @@
  }
  ]
  ]
- },
- "dependencies": {
- "jest": "^29.5.0"
  }
  }