whispercpp 1.2.0.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (9) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +3 -92
  3. data/ext/extconf.rb +9 -0
  4. data/ext/ggml.c +18380 -5241
  5. data/ext/ggml.h +2156 -502
  6. data/ext/ruby_whisper.cpp +13 -47
  7. data/ext/whisper.cpp +4182 -1787
  8. data/ext/whisper.h +334 -65
  9. metadata +3 -3
data/ext/whisper.h CHANGED
@@ -1,10 +1,20 @@
1
1
  #ifndef WHISPER_H
2
2
  #define WHISPER_H
3
3
 
4
+ #include "ggml.h"
5
+
4
6
  #include <stddef.h>
5
7
  #include <stdint.h>
6
8
  #include <stdbool.h>
7
9
 
10
+ #ifdef __GNUC__
11
+ # define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
12
+ #elif defined(_MSC_VER)
13
+ # define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
14
+ #else
15
+ # define WHISPER_DEPRECATED(func, hint) func
16
+ #endif
17
+
8
18
  #ifdef WHISPER_SHARED
9
19
  # ifdef _WIN32
10
20
  # ifdef WHISPER_BUILD
@@ -21,7 +31,6 @@
21
31
 
22
32
  #define WHISPER_SAMPLE_RATE 16000
23
33
  #define WHISPER_N_FFT 400
24
- #define WHISPER_N_MEL 80
25
34
  #define WHISPER_HOP_LENGTH 160
26
35
  #define WHISPER_CHUNK_SIZE 30
27
36
 
@@ -41,7 +50,9 @@ extern "C" {
41
50
  //
42
51
  // ...
43
52
  //
44
- // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
53
+ // whisper_context_params cparams = whisper_context_default_params();
54
+ //
55
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
45
56
  //
46
57
  // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
47
58
  // fprintf(stderr, "failed to process audio\n");
@@ -66,8 +77,53 @@ extern "C" {
66
77
  //
67
78
 
68
79
  struct whisper_context;
80
+ struct whisper_state;
81
+ struct whisper_full_params;
82
+
83
+ typedef int32_t whisper_pos;
84
+ typedef int32_t whisper_token;
85
+ typedef int32_t whisper_seq_id;
86
+
87
+ enum whisper_alignment_heads_preset {
88
+ WHISPER_AHEADS_NONE,
89
+ WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
90
+ WHISPER_AHEADS_CUSTOM,
91
+ WHISPER_AHEADS_TINY_EN,
92
+ WHISPER_AHEADS_TINY,
93
+ WHISPER_AHEADS_BASE_EN,
94
+ WHISPER_AHEADS_BASE,
95
+ WHISPER_AHEADS_SMALL_EN,
96
+ WHISPER_AHEADS_SMALL,
97
+ WHISPER_AHEADS_MEDIUM_EN,
98
+ WHISPER_AHEADS_MEDIUM,
99
+ WHISPER_AHEADS_LARGE_V1,
100
+ WHISPER_AHEADS_LARGE_V2,
101
+ WHISPER_AHEADS_LARGE_V3,
102
+ };
103
+
104
+ typedef struct whisper_ahead {
105
+ int n_text_layer;
106
+ int n_head;
107
+ } whisper_ahead;
108
+
109
+ typedef struct whisper_aheads {
110
+ size_t n_heads;
111
+ const whisper_ahead * heads;
112
+ } whisper_aheads;
113
+
114
+ struct whisper_context_params {
115
+ bool use_gpu;
116
+ int gpu_device; // CUDA device
117
+
118
+ // [EXPERIMENTAL] Token-level timestamps with DTW
119
+ bool dtw_token_timestamps;
120
+ enum whisper_alignment_heads_preset dtw_aheads_preset;
69
121
 
70
- typedef int whisper_token;
122
+ int dtw_n_top;
123
+ struct whisper_aheads dtw_aheads;
124
+
125
+ size_t dtw_mem_size; // TODO: remove
126
+ };
71
127
 
72
128
  typedef struct whisper_token_data {
73
129
  whisper_token id; // token id
@@ -83,6 +139,11 @@ extern "C" {
83
139
  int64_t t0; // start time of the token
84
140
  int64_t t1; // end time of the token
85
141
 
142
+ // [EXPERIMENTAL] Token-level timestamps with DTW
143
+ // do not use if you haven't computed token-level timestamps with dtw
144
+ // Roughly corresponds to the moment in audio in which the token was output
145
+ int64_t t_dtw;
146
+
86
147
  float vlen; // voice length of the token
87
148
  } whisper_token_data;
88
149
 
@@ -94,18 +155,102 @@ extern "C" {
94
155
  void (*close)(void * ctx);
95
156
  } whisper_model_loader;
96
157
 
158
+ // grammar element type
159
+ enum whisper_gretype {
160
+ // end of rule definition
161
+ WHISPER_GRETYPE_END = 0,
162
+
163
+ // start of alternate definition for rule
164
+ WHISPER_GRETYPE_ALT = 1,
165
+
166
+ // non-terminal element: reference to rule
167
+ WHISPER_GRETYPE_RULE_REF = 2,
168
+
169
+ // terminal element: character (code point)
170
+ WHISPER_GRETYPE_CHAR = 3,
171
+
172
+ // inverse char(s) ([^a], [^a-b] [^abc])
173
+ WHISPER_GRETYPE_CHAR_NOT = 4,
174
+
175
+ // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
176
+ // be an inclusive range ([a-z])
177
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
178
+
179
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
180
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
181
+ WHISPER_GRETYPE_CHAR_ALT = 6,
182
+ };
183
+
184
+ typedef struct whisper_grammar_element {
185
+ enum whisper_gretype type;
186
+ uint32_t value; // Unicode code point or rule ID
187
+ } whisper_grammar_element;
188
+
97
189
  // Various functions for loading a ggml whisper model.
98
190
  // Allocate (almost) all memory needed for the model.
99
191
  // Return NULL on failure
100
- WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
101
- WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
102
- WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
103
-
104
- // Frees all memory allocated by the model.
105
- WHISPER_API void whisper_free(struct whisper_context * ctx);
192
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
193
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
194
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
195
+
196
+ // These are the same as the above, but the internal state of the context is not allocated automatically
197
+ // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
198
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
199
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
200
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
201
+
202
+ WHISPER_DEPRECATED(
203
+ WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
204
+ "use whisper_init_from_file_with_params instead"
205
+ );
206
+ WHISPER_DEPRECATED(
207
+ WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
208
+ "use whisper_init_from_buffer_with_params instead"
209
+ );
210
+ WHISPER_DEPRECATED(
211
+ WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
212
+ "use whisper_init_with_params instead"
213
+ );
214
+ WHISPER_DEPRECATED(
215
+ WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
216
+ "use whisper_init_from_file_with_params_no_state instead"
217
+ );
218
+ WHISPER_DEPRECATED(
219
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
220
+ "use whisper_init_from_buffer_with_params_no_state instead"
221
+ );
222
+ WHISPER_DEPRECATED(
223
+ WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
224
+ "use whisper_init_with_params_no_state instead"
225
+ );
226
+
227
+ WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
228
+
229
+ // Given a context, enable use of OpenVINO for encode inference.
230
+ // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
231
+ // the path will be generated from the ggml model path that was passed
232
+ // in to whisper_init_from_file. For example, if 'path_model' was
233
+ // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
234
+ // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
235
+ // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
236
+ // cache_dir: Optional cache directory that can speed up init time, especially for
237
+ // GPU, by caching compiled 'blobs' there.
238
+ // Set to nullptr if not used.
239
+ // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
240
+ WHISPER_API int whisper_ctx_init_openvino_encoder(
241
+ struct whisper_context * ctx,
242
+ const char * model_path,
243
+ const char * device,
244
+ const char * cache_dir);
245
+
246
+ // Frees all allocated memory
247
+ WHISPER_API void whisper_free (struct whisper_context * ctx);
248
+ WHISPER_API void whisper_free_state(struct whisper_state * state);
249
+ WHISPER_API void whisper_free_params(struct whisper_full_params * params);
250
+ WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
106
251
 
107
252
  // Convert RAW PCM audio to log mel spectrogram.
108
- // The resulting spectrogram is stored inside the provided whisper context.
253
+ // The resulting spectrogram is stored inside the default state of the provided whisper context.
109
254
  // Returns 0 on success
110
255
  WHISPER_API int whisper_pcm_to_mel(
111
256
  struct whisper_context * ctx,
@@ -113,17 +258,30 @@ extern "C" {
113
258
  int n_samples,
114
259
  int n_threads);
115
260
 
116
- // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
117
- // The resulting spectrogram is stored inside the provided whisper context.
261
+ WHISPER_API int whisper_pcm_to_mel_with_state(
262
+ struct whisper_context * ctx,
263
+ struct whisper_state * state,
264
+ const float * samples,
265
+ int n_samples,
266
+ int n_threads);
267
+
268
+ // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
269
+ // The resulting spectrogram is stored inside the default state of the provided whisper context.
118
270
  // Returns 0 on success
119
271
  WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
120
- struct whisper_context* ctx,
121
- const float* samples,
122
- int n_samples,
123
- int n_threads);
124
-
125
-
126
- // This can be used to set a custom log mel spectrogram inside the provided whisper context.
272
+ struct whisper_context * ctx,
273
+ const float * samples,
274
+ int n_samples,
275
+ int n_threads);
276
+
277
+ WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
278
+ struct whisper_context * ctx,
279
+ struct whisper_state * state,
280
+ const float * samples,
281
+ int n_samples,
282
+ int n_threads);
283
+
284
+ // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
127
285
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
128
286
  // n_mel must be 80
129
287
  // Returns 0 on success
@@ -133,7 +291,14 @@ extern "C" {
133
291
  int n_len,
134
292
  int n_mel);
135
293
 
136
- // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
294
+ WHISPER_API int whisper_set_mel_with_state(
295
+ struct whisper_context * ctx,
296
+ struct whisper_state * state,
297
+ const float * data,
298
+ int n_len,
299
+ int n_mel);
300
+
301
+ // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
137
302
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
138
303
  // offset can be used to specify the offset of the first frame in the spectrogram.
139
304
  // Returns 0 on success
@@ -142,6 +307,12 @@ extern "C" {
142
307
  int offset,
143
308
  int n_threads);
144
309
 
310
+ WHISPER_API int whisper_encode_with_state(
311
+ struct whisper_context * ctx,
312
+ struct whisper_state * state,
313
+ int offset,
314
+ int n_threads);
315
+
145
316
  // Run the Whisper decoder to obtain the logits and probabilities for the next token.
146
317
  // Make sure to call whisper_encode() first.
147
318
  // tokens + n_tokens is the provided context for the decoder.
@@ -155,10 +326,18 @@ extern "C" {
155
326
  int n_past,
156
327
  int n_threads);
157
328
 
329
+ WHISPER_API int whisper_decode_with_state(
330
+ struct whisper_context * ctx,
331
+ struct whisper_state * state,
332
+ const whisper_token * tokens,
333
+ int n_tokens,
334
+ int n_past,
335
+ int n_threads);
336
+
158
337
  // Convert the provided text into tokens.
159
338
  // The tokens pointer must be large enough to hold the resulting tokens.
160
339
  // Returns the number of tokens on success, no more than n_max_tokens
161
- // Returns -1 on failure
340
+ // Returns a negative number on failure - the number of tokens that would have been returned
162
341
  // TODO: not sure if correct
163
342
  WHISPER_API int whisper_tokenize(
164
343
  struct whisper_context * ctx,
@@ -166,6 +345,10 @@ extern "C" {
166
345
  whisper_token * tokens,
167
346
  int n_max_tokens);
168
347
 
348
+ // Return the number of tokens in the provided text
349
+ // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
350
+ int whisper_token_count(struct whisper_context * ctx, const char * text);
351
+
169
352
  // Largest language id (i.e. number of available languages - 1)
170
353
  WHISPER_API int whisper_lang_max_id();
171
354
 
@@ -178,11 +361,14 @@ extern "C" {
178
361
  // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
179
362
  WHISPER_API const char * whisper_lang_str(int id);
180
363
 
364
+ // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
365
+ WHISPER_API const char * whisper_lang_str_full(int id);
366
+
181
367
  // Use mel data at offset_ms to try and auto-detect the spoken language
182
368
  // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
183
369
  // Returns the top language id or negative on failure
184
370
  // If not null, fills the lang_probs array with the probabilities of all languages
185
- // The array must be whispe_lang_max_id() + 1 in size
371
+ // The array must be whisper_lang_max_id() + 1 in size
186
372
  // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
187
373
  WHISPER_API int whisper_lang_auto_detect(
188
374
  struct whisper_context * ctx,
@@ -190,80 +376,100 @@ extern "C" {
190
376
  int n_threads,
191
377
  float * lang_probs);
192
378
 
193
- WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
194
- WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
195
- WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
196
- WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
197
- WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
379
+ WHISPER_API int whisper_lang_auto_detect_with_state(
380
+ struct whisper_context * ctx,
381
+ struct whisper_state * state,
382
+ int offset_ms,
383
+ int n_threads,
384
+ float * lang_probs);
385
+
386
+ WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
387
+ WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
388
+ WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
389
+ WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
390
+ WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
391
+ WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
392
+
393
+ WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
394
+ WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
395
+ WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
396
+ WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
397
+ WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
398
+ WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
399
+ WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
400
+ WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
401
+ WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
402
+ WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
403
+ WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
404
+ WHISPER_API int whisper_model_type (struct whisper_context * ctx);
198
405
 
199
406
  // Token logits obtained from the last call to whisper_decode()
200
407
  // The logits for the last token are stored in the last row
201
408
  // Rows: n_tokens
202
409
  // Cols: n_vocab
203
- WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
410
+ WHISPER_API float * whisper_get_logits (struct whisper_context * ctx);
411
+ WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
204
412
 
205
413
  // Token Id -> String. Uses the vocabulary in the provided context
206
414
  WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
415
+ WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
416
+
207
417
 
208
418
  // Special tokens
209
419
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
210
420
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
211
- WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
212
421
  WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
422
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
423
+ WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
213
424
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
214
425
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
215
426
  WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
216
427
 
217
428
  // Task tokens
218
- WHISPER_API whisper_token whisper_token_translate (void);
219
- WHISPER_API whisper_token whisper_token_transcribe(void);
429
+ WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
430
+ WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
220
431
 
221
- // Performance information
432
+ // Performance information from the default state.
222
433
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
223
434
  WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
224
435
 
225
436
  // Print system information
226
437
  WHISPER_API const char * whisper_print_system_info(void);
227
438
 
228
- // Abort a running whisper_full_parallel or whisper_full
229
- WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
230
-
231
- // Resume whisper context from an aborted state allowing it run again
232
- WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
233
-
234
- // Check the whisper context state if true then it can run if false it can not
235
- WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
236
-
237
439
  ////////////////////////////////////////////////////////////////////////////
238
440
 
239
441
  // Available sampling strategies
240
442
  enum whisper_sampling_strategy {
241
- WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreefyDecoder
443
+ WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
242
444
  WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
243
445
  };
244
446
 
245
447
  // Text segment callback
246
448
  // Called on every newly generated text segment
247
449
  // Use the whisper_full_...() functions to obtain the text segments
248
- typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
450
+ typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
451
+
452
+ // Progress callback
453
+ typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
249
454
 
250
455
  // Encoder begin callback
251
456
  // If not NULL, called before the encoder starts
252
457
  // If it returns false, the computation is aborted
253
- typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
458
+ typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
254
459
 
255
460
  // Logits filter callback
256
461
  // Can be used to modify the logits before sampling
257
462
  // If not NULL, called after applying temperature to logits
258
463
  typedef void (*whisper_logits_filter_callback)(
259
464
  struct whisper_context * ctx,
465
+ struct whisper_state * state,
260
466
  const whisper_token_data * tokens,
261
467
  int n_tokens,
262
468
  float * logits,
263
469
  void * user_data);
264
470
 
265
471
  // Parameters for the whisper_full() function
266
- // If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
472
+ // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
267
473
  // whisper_full_default_params()
268
474
  struct whisper_full_params {
269
475
  enum whisper_sampling_strategy strategy;
@@ -275,6 +481,7 @@ extern "C" {
275
481
 
276
482
  bool translate;
277
483
  bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
484
+ bool no_timestamps; // do not generate timestamps
278
485
  bool single_segment; // force single segment output (useful for streaming)
279
486
  bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
280
487
  bool print_progress; // print progress information
@@ -292,15 +499,26 @@ extern "C" {
292
499
  // [EXPERIMENTAL] speed-up techniques
293
500
  // note: these can significantly reduce the quality of the output
294
501
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
502
+ bool debug_mode; // enable debug mode, which provides extra info (e.g. dumps log_mel)
295
503
  int audio_ctx; // overwrite the audio context size (0 = use default)
296
504
 
505
+ // [EXPERIMENTAL] [TDRZ] tinydiarize
506
+ bool tdrz_enable; // enable tinydiarize speaker turn detection
507
+
508
+ // A regular expression that matches tokens to suppress
509
+ const char * suppress_regex;
510
+
297
511
  // tokens to provide to the whisper decoder as initial prompt
298
512
  // these are prepended to any existing text context from a previous call
513
+ // use whisper_tokenize() to convert text to tokens
514
+ // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
515
+ const char * initial_prompt;
299
516
  const whisper_token * prompt_tokens;
300
517
  int prompt_n_tokens;
301
518
 
302
519
  // for auto-detection, set to nullptr, "" or "auto"
303
520
  const char * language;
521
+ bool detect_language;
304
522
 
305
523
  // common decoding parameters:
306
524
  bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
@@ -331,18 +549,36 @@ extern "C" {
331
549
  whisper_new_segment_callback new_segment_callback;
332
550
  void * new_segment_callback_user_data;
333
551
 
552
+ // called on each progress update
553
+ whisper_progress_callback progress_callback;
554
+ void * progress_callback_user_data;
555
+
334
556
  // called each time before the encoder starts
335
557
  whisper_encoder_begin_callback encoder_begin_callback;
336
558
  void * encoder_begin_callback_user_data;
337
559
 
560
+ // called each time before ggml computation starts
561
+ ggml_abort_callback abort_callback;
562
+ void * abort_callback_user_data;
563
+
338
564
  // called by each decoder to filter obtained logits
339
565
  whisper_logits_filter_callback logits_filter_callback;
340
566
  void * logits_filter_callback_user_data;
567
+
568
+ const whisper_grammar_element ** grammar_rules;
569
+ size_t n_grammar_rules;
570
+ size_t i_start_rule;
571
+ float grammar_penalty;
341
572
  };
342
573
 
574
+ // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
575
+ WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
576
+ WHISPER_API struct whisper_context_params whisper_context_default_params(void);
577
+ WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
343
578
  WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
344
579
 
345
580
  // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
581
+ // Not thread safe for same context
346
582
  // Uses the specified decoding strategy to obtain the text.
347
583
  WHISPER_API int whisper_full(
348
584
  struct whisper_context * ctx,
@@ -350,7 +586,16 @@ extern "C" {
350
586
  const float * samples,
351
587
  int n_samples);
352
588
 
353
- // Split the input audio in chunks and process each chunk separately using whisper_full()
589
+ WHISPER_API int whisper_full_with_state(
590
+ struct whisper_context * ctx,
591
+ struct whisper_state * state,
592
+ struct whisper_full_params params,
593
+ const float * samples,
594
+ int n_samples);
595
+
596
+ // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
597
+ // Result is stored in the default state of the context
598
+ // Not thread safe if executed in parallel on the same context.
354
599
  // It seems this approach can offer some speedup in some cases.
355
600
  // However, the transcription accuracy can be worse at the beginning and end of each chunk.
356
601
  WHISPER_API int whisper_full_parallel(
@@ -360,40 +605,64 @@ extern "C" {
360
605
  int n_samples,
361
606
  int n_processors);
362
607
 
363
- // Number of generated text segments.
608
+ // Number of generated text segments
364
609
  // A segment can be a few words, a sentence, or even a paragraph.
365
- WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
610
+ WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx);
611
+ WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
366
612
 
367
- // Language id associated with the current context
613
+ // Language id associated with the context's default state
368
614
  WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
369
615
 
370
- // Get the start and end time of the specified segment.
371
- WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
372
- WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
616
+ // Language id associated with the provided state
617
+ WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
618
+
619
+ // Get the start and end time of the specified segment
620
+ WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment);
621
+ WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
622
+
623
+ WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
624
+ WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
373
625
 
374
- // Get the text of the specified segment.
375
- WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
626
+ // Get whether the next segment is predicted as a speaker turn
627
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
628
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
376
629
 
377
- // Get number of tokens in the specified segment.
378
- WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
630
+ // Get the text of the specified segment
631
+ WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
632
+ WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
379
633
 
380
- // Get the token text of the specified token in the specified segment.
381
- WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
382
- WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
634
+ // Get number of tokens in the specified segment
635
+ WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
636
+ WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
383
637
 
384
- // Get token data for the specified token in the specified segment.
638
+ // Get the token text of the specified token in the specified segment
639
+ WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
640
+ WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
641
+
642
+ WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
643
+ WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
644
+
645
+ // Get token data for the specified token in the specified segment
385
646
  // This contains probabilities, timestamps, etc.
386
- WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
647
+ WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
648
+ WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
387
649
 
388
- // Get the probability of the specified token in the specified segment.
389
- WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
650
+ // Get the probability of the specified token in the specified segment
651
+ WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
652
+ WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
390
653
 
391
654
  ////////////////////////////////////////////////////////////////////////////
392
655
 
393
656
  // Temporary helpers needed for exposing ggml interface
394
657
 
395
- WHISPER_API int whisper_bench_memcpy(int n_threads);
396
- WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
658
+ WHISPER_API int whisper_bench_memcpy (int n_threads);
659
+ WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
660
+ WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
661
+ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
662
+
663
+ // Control logging output; default behavior is to print to stderr
664
+
665
+ WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
397
666
 
398
667
  #ifdef __cplusplus
399
668
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whispercpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0.2
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Georgi Gerganov
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2023-02-27 00:00:00.000000000 Z
12
+ date: 2024-05-14 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: High-performance inference of OpenAI's Whisper automatic speech recognition
15
15
  (ASR) model via Ruby
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
55
55
  - !ruby/object:Gem::Version
56
56
  version: '0'
57
57
  requirements: []
58
- rubygems_version: 3.2.33
58
+ rubygems_version: 3.5.9
59
59
  signing_key:
60
60
  specification_version: 4
61
61
  summary: Ruby whisper.cpp bindings