whisper.rn 0.4.0-rc.0 → 0.4.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +14 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +226 -109
  9. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  10. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  11. package/cpp/README.md +1 -1
  12. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  13. package/cpp/coreml/whisper-encoder.h +4 -0
  14. package/cpp/coreml/whisper-encoder.mm +5 -3
  15. package/cpp/ggml-aarch64.c +129 -0
  16. package/cpp/ggml-aarch64.h +19 -0
  17. package/cpp/ggml-alloc.c +805 -400
  18. package/cpp/ggml-alloc.h +60 -10
  19. package/cpp/ggml-backend-impl.h +216 -0
  20. package/cpp/ggml-backend-reg.cpp +204 -0
  21. package/cpp/ggml-backend.cpp +1996 -0
  22. package/cpp/ggml-backend.cpp.rej +12 -0
  23. package/cpp/ggml-backend.h +336 -0
  24. package/cpp/ggml-common.h +1853 -0
  25. package/cpp/ggml-cpp.h +38 -0
  26. package/cpp/ggml-cpu-aarch64.c +3560 -0
  27. package/cpp/ggml-cpu-aarch64.h +30 -0
  28. package/cpp/ggml-cpu-impl.h +371 -0
  29. package/cpp/ggml-cpu-quants.c +10822 -0
  30. package/cpp/ggml-cpu-quants.h +63 -0
  31. package/cpp/ggml-cpu.c +13970 -0
  32. package/cpp/ggml-cpu.cpp +663 -0
  33. package/cpp/ggml-cpu.h +177 -0
  34. package/cpp/ggml-impl.h +551 -0
  35. package/cpp/ggml-metal-impl.h +249 -0
  36. package/cpp/ggml-metal.h +24 -43
  37. package/cpp/ggml-metal.m +4190 -1075
  38. package/cpp/ggml-quants.c +5247 -0
  39. package/cpp/ggml-quants.h +100 -0
  40. package/cpp/ggml-threading.cpp +12 -0
  41. package/cpp/ggml-threading.h +12 -0
  42. package/cpp/ggml-whisper.metallib +0 -0
  43. package/cpp/ggml.c +5474 -18763
  44. package/cpp/ggml.h +833 -628
  45. package/cpp/rn-audioutils.cpp +68 -0
  46. package/cpp/rn-audioutils.h +14 -0
  47. package/cpp/rn-whisper-log.h +11 -0
  48. package/cpp/rn-whisper.cpp +221 -52
  49. package/cpp/rn-whisper.h +50 -15
  50. package/cpp/whisper.cpp +2863 -1340
  51. package/cpp/whisper.h +170 -38
  52. package/ios/RNWhisper.mm +141 -46
  53. package/ios/RNWhisperAudioUtils.h +1 -2
  54. package/ios/RNWhisperAudioUtils.m +18 -67
  55. package/ios/RNWhisperContext.h +11 -8
  56. package/ios/RNWhisperContext.mm +197 -144
  57. package/jest/mock.js +15 -2
  58. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  59. package/lib/commonjs/index.js +78 -28
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/commonjs/version.json +1 -1
  62. package/lib/module/NativeRNWhisper.js.map +1 -1
  63. package/lib/module/index.js +78 -28
  64. package/lib/module/index.js.map +1 -1
  65. package/lib/module/version.json +1 -1
  66. package/lib/typescript/NativeRNWhisper.d.ts +14 -4
  67. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  68. package/lib/typescript/index.d.ts +39 -5
  69. package/lib/typescript/index.d.ts.map +1 -1
  70. package/package.json +9 -7
  71. package/src/NativeRNWhisper.ts +21 -4
  72. package/src/index.ts +102 -42
  73. package/src/version.json +1 -1
  74. package/whisper-rn.podspec +11 -18
  75. package/cpp/ggml-metal.metal +0 -2353
  76. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
  77. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
  78. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  79. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19
package/cpp/whisper.h CHANGED
@@ -1,10 +1,21 @@
1
1
  #ifndef WHISPER_H
2
2
  #define WHISPER_H
3
3
 
4
+ #include "ggml.h"
5
+ #include "ggml-cpu.h"
6
+
4
7
  #include <stddef.h>
5
8
  #include <stdint.h>
6
9
  #include <stdbool.h>
7
10
 
11
+ #ifdef __GNUC__
12
+ # define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
13
+ #elif defined(_MSC_VER)
14
+ # define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
15
+ #else
16
+ # define WHISPER_DEPRECATED(func, hint) func
17
+ #endif
18
+
8
19
  #ifdef WHISPER_SHARED
9
20
  # ifdef _WIN32
10
21
  # ifdef WHISPER_BUILD
@@ -21,7 +32,6 @@
21
32
 
22
33
  #define WHISPER_SAMPLE_RATE 16000
23
34
  #define WHISPER_N_FFT 400
24
- #define WHISPER_N_MEL 80
25
35
  #define WHISPER_HOP_LENGTH 160
26
36
  #define WHISPER_CHUNK_SIZE 30
27
37
 
@@ -41,7 +51,9 @@ extern "C" {
41
51
  //
42
52
  // ...
43
53
  //
44
- // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
54
+ // whisper_context_params cparams = whisper_context_default_params();
55
+ //
56
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
45
57
  //
46
58
  // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
47
59
  // fprintf(stderr, "failed to process audio\n");
@@ -69,7 +81,53 @@ extern "C" {
69
81
  struct whisper_state;
70
82
  struct whisper_full_params;
71
83
 
72
- typedef int whisper_token;
84
+ typedef int32_t whisper_pos;
85
+ typedef int32_t whisper_token;
86
+ typedef int32_t whisper_seq_id;
87
+
88
+ enum whisper_alignment_heads_preset {
89
+ WHISPER_AHEADS_NONE,
90
+ WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
91
+ WHISPER_AHEADS_CUSTOM,
92
+ WHISPER_AHEADS_TINY_EN,
93
+ WHISPER_AHEADS_TINY,
94
+ WHISPER_AHEADS_BASE_EN,
95
+ WHISPER_AHEADS_BASE,
96
+ WHISPER_AHEADS_SMALL_EN,
97
+ WHISPER_AHEADS_SMALL,
98
+ WHISPER_AHEADS_MEDIUM_EN,
99
+ WHISPER_AHEADS_MEDIUM,
100
+ WHISPER_AHEADS_LARGE_V1,
101
+ WHISPER_AHEADS_LARGE_V2,
102
+ WHISPER_AHEADS_LARGE_V3,
103
+ WHISPER_AHEADS_LARGE_V3_TURBO,
104
+ };
105
+
106
+ typedef struct whisper_ahead {
107
+ int n_text_layer;
108
+ int n_head;
109
+ } whisper_ahead;
110
+
111
+ typedef struct whisper_aheads {
112
+ size_t n_heads;
113
+ const whisper_ahead * heads;
114
+ } whisper_aheads;
115
+
116
+ struct whisper_context_params {
117
+ bool use_gpu;
118
+ bool use_coreml;
119
+ bool flash_attn;
120
+ int gpu_device; // CUDA device
121
+
122
+ // [EXPERIMENTAL] Token-level timestamps with DTW
123
+ bool dtw_token_timestamps;
124
+ enum whisper_alignment_heads_preset dtw_aheads_preset;
125
+
126
+ int dtw_n_top;
127
+ struct whisper_aheads dtw_aheads;
128
+
129
+ size_t dtw_mem_size; // TODO: remove
130
+ };
73
131
 
74
132
  typedef struct whisper_token_data {
75
133
  whisper_token id; // token id
@@ -85,6 +143,11 @@ extern "C" {
85
143
  int64_t t0; // start time of the token
86
144
  int64_t t1; // end time of the token
87
145
 
146
+ // [EXPERIMENTAL] Token-level timestamps with DTW
147
+ // do not use if you haven't computed token-level timestamps with dtw
148
+ // Roughly corresponds to the moment in audio in which the token was output
149
+ int64_t t_dtw;
150
+
88
151
  float vlen; // voice length of the token
89
152
  } whisper_token_data;
90
153
 
@@ -96,18 +159,74 @@ extern "C" {
96
159
  void (*close)(void * ctx);
97
160
  } whisper_model_loader;
98
161
 
162
+ // grammar element type
163
+ enum whisper_gretype {
164
+ // end of rule definition
165
+ WHISPER_GRETYPE_END = 0,
166
+
167
+ // start of alternate definition for rule
168
+ WHISPER_GRETYPE_ALT = 1,
169
+
170
+ // non-terminal element: reference to rule
171
+ WHISPER_GRETYPE_RULE_REF = 2,
172
+
173
+ // terminal element: character (code point)
174
+ WHISPER_GRETYPE_CHAR = 3,
175
+
176
+ // inverse char(s) ([^a], [^a-b], [^abc])
177
+ WHISPER_GRETYPE_CHAR_NOT = 4,
178
+
179
+ // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
180
+ // be an inclusive range ([a-z])
181
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
182
+
183
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
184
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
185
+ WHISPER_GRETYPE_CHAR_ALT = 6,
186
+ };
187
+
188
+ typedef struct whisper_grammar_element {
189
+ enum whisper_gretype type;
190
+ uint32_t value; // Unicode code point or rule ID
191
+ } whisper_grammar_element;
192
+
99
193
  // Various functions for loading a ggml whisper model.
100
194
  // Allocate (almost) all memory needed for the model.
101
195
  // Return NULL on failure
102
- WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
103
- WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
104
- WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
196
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
197
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
198
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
105
199
 
106
200
  // These are the same as the above, but the internal state of the context is not allocated automatically
107
201
  // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
108
- WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
109
- WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
110
- WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
202
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
203
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
204
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
205
+
206
+ WHISPER_DEPRECATED(
207
+ WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
208
+ "use whisper_init_from_file_with_params instead"
209
+ );
210
+ WHISPER_DEPRECATED(
211
+ WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
212
+ "use whisper_init_from_buffer_with_params instead"
213
+ );
214
+ WHISPER_DEPRECATED(
215
+ WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
216
+ "use whisper_init_with_params instead"
217
+ );
218
+ WHISPER_DEPRECATED(
219
+ WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
220
+ "use whisper_init_from_file_with_params_no_state instead"
221
+ );
222
+ WHISPER_DEPRECATED(
223
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
224
+ "use whisper_init_from_buffer_with_params_no_state instead"
225
+ );
226
+ WHISPER_DEPRECATED(
227
+ WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
228
+ "use whisper_init_with_params_no_state instead"
229
+ );
111
230
 
112
231
  WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
113
232
 
@@ -122,6 +241,13 @@ extern "C" {
122
241
  // GPU, by caching compiled 'blobs' there.
123
242
  // Set to nullptr if not used.
124
243
  // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
244
+ WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
245
+ struct whisper_context * ctx,
246
+ struct whisper_state * state,
247
+ const char * model_path,
248
+ const char * device,
249
+ const char * cache_dir);
250
+
125
251
  WHISPER_API int whisper_ctx_init_openvino_encoder(
126
252
  struct whisper_context * ctx,
127
253
  const char * model_path,
@@ -132,6 +258,7 @@ extern "C" {
132
258
  WHISPER_API void whisper_free (struct whisper_context * ctx);
133
259
  WHISPER_API void whisper_free_state(struct whisper_state * state);
134
260
  WHISPER_API void whisper_free_params(struct whisper_full_params * params);
261
+ WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
135
262
 
136
263
  // Convert RAW PCM audio to log mel spectrogram.
137
264
  // The resulting spectrogram is stored inside the default state of the provided whisper context.
@@ -149,22 +276,6 @@ extern "C" {
149
276
  int n_samples,
150
277
  int n_threads);
151
278
 
152
- // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
153
- // The resulting spectrogram is stored inside the default state of the provided whisper context.
154
- // Returns 0 on success
155
- WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
156
- struct whisper_context * ctx,
157
- const float * samples,
158
- int n_samples,
159
- int n_threads);
160
-
161
- WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
162
- struct whisper_context * ctx,
163
- struct whisper_state * state,
164
- const float * samples,
165
- int n_samples,
166
- int n_threads);
167
-
168
279
  // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
169
280
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
170
281
  // n_mel must be 80
@@ -221,7 +332,7 @@ extern "C" {
221
332
  // Convert the provided text into tokens.
222
333
  // The tokens pointer must be large enough to hold the resulting tokens.
223
334
  // Returns the number of tokens on success, no more than n_max_tokens
224
- // Returns -1 on failure
335
+ // Returns a negative number on failure - its absolute value is the number of tokens that would have been returned
225
336
  // TODO: not sure if correct
226
337
  WHISPER_API int whisper_tokenize(
227
338
  struct whisper_context * ctx,
@@ -229,8 +340,12 @@ extern "C" {
229
340
  whisper_token * tokens,
230
341
  int n_max_tokens);
231
342
 
343
+ // Return the number of tokens in the provided text
344
+ // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
345
+ int whisper_token_count(struct whisper_context * ctx, const char * text);
346
+
232
347
  // Largest language id (i.e. number of available languages - 1)
233
- WHISPER_API int whisper_lang_max_id();
348
+ WHISPER_API int whisper_lang_max_id(void);
234
349
 
235
350
  // Return the id of the specified language, returns -1 if not found
236
351
  // Examples:
@@ -241,6 +356,9 @@ extern "C" {
241
356
  // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
242
357
  WHISPER_API const char * whisper_lang_str(int id);
243
358
 
359
+ // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
360
+ WHISPER_API const char * whisper_lang_str_full(int id);
361
+
244
362
  // Use mel data at offset_ms to try and auto-detect the spoken language
245
363
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
246
364
  // Returns the top language id or negative on failure
@@ -307,6 +425,14 @@ extern "C" {
307
425
  WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
308
426
 
309
427
  // Performance information from the default state.
428
+ struct whisper_timings {
429
+ float sample_ms;
430
+ float encode_ms;
431
+ float decode_ms;
432
+ float batchd_ms;
433
+ float prompt_ms;
434
+ };
435
+ WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
310
436
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
311
437
  WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
312
438
 
@@ -334,11 +460,6 @@ extern "C" {
334
460
  // If it returns false, the computation is aborted
335
461
  typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
336
462
 
337
- // Abort callback
338
- // If not NULL, called before ggml computation
339
- // If it returns true, the computation is aborted
340
- typedef bool (*whisper_abort_callback)(void * user_data);
341
-
342
463
  // Logits filter callback
343
464
  // Can be used to modify the logits before sampling
344
465
  // If not NULL, called after applying temperature to logits
@@ -363,6 +484,7 @@ extern "C" {
363
484
 
364
485
  bool translate;
365
486
  bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
487
+ bool no_timestamps; // do not generate timestamps
366
488
  bool single_segment; // force single segment output (useful for streaming)
367
489
  bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
368
490
  bool print_progress; // print progress information
@@ -379,15 +501,19 @@ extern "C" {
379
501
 
380
502
  // [EXPERIMENTAL] speed-up techniques
381
503
  // note: these can significantly reduce the quality of the output
382
- bool speed_up; // speed-up the audio by 2x using Phase Vocoder
383
504
  bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
384
505
  int audio_ctx; // overwrite the audio context size (0 = use default)
385
506
 
386
507
  // [EXPERIMENTAL] [TDRZ] tinydiarize
387
508
  bool tdrz_enable; // enable tinydiarize speaker turn detection
388
509
 
510
+ // A regular expression that matches tokens to suppress
511
+ const char * suppress_regex;
512
+
389
513
  // tokens to provide to the whisper decoder as initial prompt
390
514
  // these are prepended to any existing text context from a previous call
515
+ // use whisper_tokenize() to convert text to tokens
516
+ // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
391
517
  const char * initial_prompt;
392
518
  const whisper_token * prompt_tokens;
393
519
  int prompt_n_tokens;
@@ -434,17 +560,24 @@ extern "C" {
434
560
  void * encoder_begin_callback_user_data;
435
561
 
436
562
  // called each time before ggml computation starts
437
- whisper_abort_callback abort_callback;
563
+ wsp_ggml_abort_callback abort_callback;
438
564
  void * abort_callback_user_data;
439
565
 
440
566
  // called by each decoder to filter obtained logits
441
567
  whisper_logits_filter_callback logits_filter_callback;
442
568
  void * logits_filter_callback_user_data;
569
+
570
+ const whisper_grammar_element ** grammar_rules;
571
+ size_t n_grammar_rules;
572
+ size_t i_start_rule;
573
+ float grammar_penalty;
443
574
  };
444
575
 
445
- // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
576
+ // NOTE: the *_by_ref functions allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
577
+ WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
578
+ WHISPER_API struct whisper_context_params whisper_context_default_params (void);
446
579
  WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
447
- WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
580
+ WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy);
448
581
 
449
582
  // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
450
583
  // Not thread safe for same context
@@ -531,8 +664,7 @@ extern "C" {
531
664
 
532
665
  // Control logging output; default behavior is to print to stderr
533
666
 
534
- typedef void (*whisper_log_callback)(const char * line);
535
- WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
667
+ WHISPER_API void whisper_log_set(wsp_ggml_log_callback log_callback, void * user_data);
536
668
 
537
669
  #ifdef __cplusplus
538
670
  }
package/ios/RNWhisper.mm CHANGED
@@ -48,6 +48,9 @@ RCT_REMAP_METHOD(initContext,
48
48
 
49
49
  NSString *modelPath = [modelOptions objectForKey:@"filePath"];
50
50
  BOOL isBundleAsset = [[modelOptions objectForKey:@"isBundleAsset"] boolValue];
51
+ BOOL useGpu = [[modelOptions objectForKey:@"useGpu"] boolValue];
52
+ BOOL useCoreMLIos = [[modelOptions objectForKey:@"useCoreMLIos"] boolValue];
53
+ BOOL useFlashAttn = [[modelOptions objectForKey:@"useFlashAttn"] boolValue];
51
54
 
52
55
  // To support debug assets in development mode
53
56
  BOOL downloadCoreMLAssets = [[modelOptions objectForKey:@"downloadCoreMLAssets"] boolValue];
@@ -75,6 +78,9 @@ RCT_REMAP_METHOD(initContext,
75
78
  RNWhisperContext *context = [RNWhisperContext
76
79
  initWithModelPath:path
77
80
  contextId:contextId
81
+ noCoreML:!useCoreMLIos
82
+ noMetal:!useGpu
83
+ useFlashAttn:useFlashAttn
78
84
  ];
79
85
  if ([context getContext] == NULL) {
80
86
  reject(@"whisper_cpp_error", @"Failed to load the model", nil);
@@ -83,7 +89,11 @@ RCT_REMAP_METHOD(initContext,
83
89
 
84
90
  [contexts setObject:context forKey:[NSNumber numberWithInt:contextId]];
85
91
 
86
- resolve([NSNumber numberWithInt:contextId]);
92
+ resolve(@{
93
+ @"contextId": @(contextId),
94
+ @"gpu": @([context isMetalEnabled]),
95
+ @"reasonNoGPU": [context reasonNoMetal],
96
+ });
87
97
  }
88
98
 
89
99
  - (NSArray *)supportedEvents {
@@ -95,48 +105,23 @@ RCT_REMAP_METHOD(initContext,
95
105
  ];
96
106
  }
97
107
 
98
- RCT_REMAP_METHOD(transcribeFile,
99
- withContextId:(int)contextId
100
- withJobId:(int)jobId
101
- withWaveFile:(NSString *)waveFilePath
102
- withOptions:(NSDictionary *)options
103
- withResolver:(RCTPromiseResolveBlock)resolve
104
- withRejecter:(RCTPromiseRejectBlock)reject)
108
+ - (void)transcribeData:(RNWhisperContext *)context
109
+ withContextId:(int)contextId
110
+ withJobId:(int)jobId
111
+ withData:(float *)data
112
+ withDataCount:(int)count
113
+ withOptions:(NSDictionary *)options
114
+ withResolver:(RCTPromiseResolveBlock)resolve
115
+ withRejecter:(RCTPromiseRejectBlock)reject
105
116
  {
106
- RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
107
-
108
- if (context == nil) {
109
- reject(@"whisper_error", @"Context not found", nil);
110
- return;
111
- }
112
- if ([context isCapturing]) {
113
- reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
114
- return;
115
- }
116
- if ([context isTranscribing]) {
117
- reject(@"whisper_error", @"Context is already transcribing", nil);
118
- return;
119
- }
120
-
121
- NSString *path = waveFilePath;
122
- if ([path hasPrefix:@"http://"] || [path hasPrefix:@"https://"]) {
123
- path = [RNWhisperDownloader downloadFile:path toFile:nil];
124
- }
125
-
126
- int count = 0;
127
- float *waveFile = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
128
- if (waveFile == nil) {
129
- reject(@"whisper_error", @"Invalid file", nil);
130
- return;
131
- }
132
- [context transcribeFile:jobId
133
- audioData:waveFile
117
+ [context transcribeData:jobId
118
+ audioData:data
134
119
  audioDataCount:count
135
120
  options:options
136
121
  onProgress: ^(int progress) {
137
- if (rn_whisper_transcribe_is_aborted(jobId)) {
138
- return;
139
- }
122
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
123
+ if (job && job->is_aborted()) return;
124
+
140
125
  dispatch_async(dispatch_get_main_queue(), ^{
141
126
  [self sendEventWithName:@"@RNWhisper_onTranscribeProgress"
142
127
  body:@{
@@ -148,9 +133,9 @@ RCT_REMAP_METHOD(transcribeFile,
148
133
  });
149
134
  }
150
135
  onNewSegments: ^(NSDictionary *result) {
151
- if (rn_whisper_transcribe_is_aborted(jobId)) {
152
- return;
153
- }
136
+ rnwhisper::job* job = rnwhisper::job_get(jobId);
137
+ if (job && job->is_aborted()) return;
138
+
154
139
  dispatch_async(dispatch_get_main_queue(), ^{
155
140
  [self sendEventWithName:@"@RNWhisper_onTranscribeNewSegments"
156
141
  body:@{
@@ -162,12 +147,10 @@ RCT_REMAP_METHOD(transcribeFile,
162
147
  });
163
148
  }
164
149
  onEnd: ^(int code) {
165
- if (code != 0) {
166
- free(waveFile);
150
+ if (code != 0 && code != 999) {
167
151
  reject(@"whisper_cpp_error", [NSString stringWithFormat:@"Failed to transcribe the file. Code: %d", code], nil);
168
152
  return;
169
153
  }
170
- free(waveFile);
171
154
  NSMutableDictionary *result = [context getTextSegments];
172
155
  result[@"isAborted"] = @([context isStoppedByAction]);
173
156
  resolve(result);
@@ -175,6 +158,99 @@ RCT_REMAP_METHOD(transcribeFile,
175
158
  ];
176
159
  }
177
160
 
161
+ RCT_REMAP_METHOD(transcribeFile,
162
+ withContextId:(int)contextId
163
+ withJobId:(int)jobId
164
+ withWaveFile:(NSString *)waveFilePathOrDataBase64
165
+ withOptions:(NSDictionary *)options
166
+ withResolver:(RCTPromiseResolveBlock)resolve
167
+ withRejecter:(RCTPromiseRejectBlock)reject)
168
+ {
169
+ RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
170
+
171
+ if (context == nil) {
172
+ reject(@"whisper_error", @"Context not found", nil);
173
+ return;
174
+ }
175
+ if ([context isCapturing]) {
176
+ reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
177
+ return;
178
+ }
179
+ if ([context isTranscribing]) {
180
+ reject(@"whisper_error", @"Context is already transcribing", nil);
181
+ return;
182
+ }
183
+
184
+ float *data = nil;
185
+ int count = 0;
186
+ if ([waveFilePathOrDataBase64 hasPrefix:@"http://"] || [waveFilePathOrDataBase64 hasPrefix:@"https://"]) {
187
+ NSString *path = [RNWhisperDownloader downloadFile:waveFilePathOrDataBase64 toFile:nil];
188
+ data = [RNWhisperAudioUtils decodeWaveFile:path count:&count];
189
+ } else if ([waveFilePathOrDataBase64 hasPrefix:@"data:audio/wav;base64,"]) {
190
+ NSData *waveData = [[NSData alloc] initWithBase64EncodedString:[waveFilePathOrDataBase64 substringFromIndex:22] options:0];
191
+ data = [RNWhisperAudioUtils decodeWaveData:waveData count:&count cutHeader:YES];
192
+ } else {
193
+ data = [RNWhisperAudioUtils decodeWaveFile:waveFilePathOrDataBase64 count:&count];
194
+ }
195
+ if (data == nil) {
196
+ reject(@"whisper_error", @"Invalid file", nil);
197
+ return;
198
+ }
199
+
200
+ [self transcribeData:context
201
+ withContextId:contextId
202
+ withJobId:jobId
203
+ withData:data
204
+ withDataCount:count
205
+ withOptions:options
206
+ withResolver:resolve
207
+ withRejecter:reject
208
+ ];
209
+ }
210
+
211
+ RCT_REMAP_METHOD(transcribeData,
212
+ withContextId:(int)contextId
213
+ withJobId:(int)jobId
214
+ withData:(NSString *)dataBase64 // pcm data base64 encoded
215
+ withOptions:(NSDictionary *)options
216
+ withResolver:(RCTPromiseResolveBlock)resolve
217
+ withRejecter:(RCTPromiseRejectBlock)reject)
218
+ {
219
+ RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
220
+
221
+ if (context == nil) {
222
+ reject(@"whisper_error", @"Context not found", nil);
223
+ return;
224
+ }
225
+ if ([context isCapturing]) {
226
+ reject(@"whisper_error", @"The context is in realtime transcribe mode", nil);
227
+ return;
228
+ }
229
+ if ([context isTranscribing]) {
230
+ reject(@"whisper_error", @"Context is already transcribing", nil);
231
+ return;
232
+ }
233
+
234
+ NSData *pcmData = [[NSData alloc] initWithBase64EncodedString:dataBase64 options:0];
235
+ int count = 0;
236
+ float *data = [RNWhisperAudioUtils decodeWaveData:pcmData count:&count cutHeader:NO];
237
+
238
+ if (data == nil) {
239
+ reject(@"whisper_error", @"Invalid data", nil);
240
+ return;
241
+ }
242
+
243
+ [self transcribeData:context
244
+ withContextId:contextId
245
+ withJobId:jobId
246
+ withData:data
247
+ withDataCount:count
248
+ withOptions:options
249
+ withResolver:resolve
250
+ withRejecter:reject
251
+ ];
252
+ }
253
+
178
254
  RCT_REMAP_METHOD(startRealtimeTranscribe,
179
255
  withContextId:(int)contextId
180
256
  withJobId:(int)jobId
@@ -236,6 +312,25 @@ RCT_REMAP_METHOD(abortTranscribe,
236
312
  resolve(nil);
237
313
  }
238
314
 
315
+ RCT_REMAP_METHOD(bench,
316
+ withContextId:(int)contextId
317
+ withMaxThreads:(int)maxThreads
318
+ withResolver:(RCTPromiseResolveBlock)resolve
319
+ withRejecter:(RCTPromiseRejectBlock)reject)
320
+ {
321
+ RNWhisperContext *context = contexts[[NSNumber numberWithInt:contextId]];
322
+ if (context == nil) {
323
+ reject(@"whisper_error", @"Context not found", nil);
324
+ return;
325
+ }
326
+ if ([context isTranscribing]) {
327
+ reject(@"whisper_error", @"The context is transcribing", nil);
328
+ return;
329
+ }
330
+ NSString *result = [context bench:maxThreads];
331
+ resolve(result);
332
+ }
333
+
239
334
  RCT_REMAP_METHOD(releaseContext,
240
335
  withContextId:(int)contextId
241
336
  withResolver:(RCTPromiseResolveBlock)resolve
@@ -271,7 +366,7 @@ RCT_REMAP_METHOD(releaseAllContexts,
271
366
  [context invalidate];
272
367
  }
273
368
 
274
- rn_whisper_abort_all_transcribe(); // graceful abort
369
+ rnwhisper::job_abort_all(); // graceful abort
275
370
 
276
371
  [contexts removeAllObjects];
277
372
  contexts = nil;
@@ -2,8 +2,7 @@
2
2
 
3
3
  @interface RNWhisperAudioUtils : NSObject
4
4
 
5
- + (NSData *)concatShortBuffers:(NSMutableArray<NSValue *> *)buffers sliceNSamples:(NSMutableArray<NSNumber *> *)sliceNSamples;
6
- + (void)saveWavFile:(NSData *)rawData audioOutputFile:(NSString *)audioOutputFile;
5
+ + (float *)decodeWaveData:(NSData*)data count:(int *)count cutHeader:(BOOL)cutHeader;
7
6
  + (float *)decodeWaveFile:(NSString*)filePath count:(int *)count;
8
7
 
9
8
  @end