whispercpp 1.2.0.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -92
- data/ext/extconf.rb +9 -0
- data/ext/ggml.c +18380 -5241
- data/ext/ggml.h +2156 -502
- data/ext/ruby_whisper.cpp +13 -47
- data/ext/whisper.cpp +4182 -1787
- data/ext/whisper.h +334 -65
- metadata +3 -3
data/ext/whisper.h
CHANGED
@@ -1,10 +1,20 @@
|
|
1
1
|
#ifndef WHISPER_H
|
2
2
|
#define WHISPER_H
|
3
3
|
|
4
|
+
#include "ggml.h"
|
5
|
+
|
4
6
|
#include <stddef.h>
|
5
7
|
#include <stdint.h>
|
6
8
|
#include <stdbool.h>
|
7
9
|
|
10
|
+
#ifdef __GNUC__
|
11
|
+
# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
12
|
+
#elif defined(_MSC_VER)
|
13
|
+
# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
14
|
+
#else
|
15
|
+
# define WHISPER_DEPRECATED(func, hint) func
|
16
|
+
#endif
|
17
|
+
|
8
18
|
#ifdef WHISPER_SHARED
|
9
19
|
# ifdef _WIN32
|
10
20
|
# ifdef WHISPER_BUILD
|
@@ -21,7 +31,6 @@
|
|
21
31
|
|
22
32
|
#define WHISPER_SAMPLE_RATE 16000
|
23
33
|
#define WHISPER_N_FFT 400
|
24
|
-
#define WHISPER_N_MEL 80
|
25
34
|
#define WHISPER_HOP_LENGTH 160
|
26
35
|
#define WHISPER_CHUNK_SIZE 30
|
27
36
|
|
@@ -41,7 +50,9 @@ extern "C" {
|
|
41
50
|
//
|
42
51
|
// ...
|
43
52
|
//
|
44
|
-
//
|
53
|
+
// whisper_context_params cparams = whisper_context_default_params();
|
54
|
+
//
|
55
|
+
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
|
45
56
|
//
|
46
57
|
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
47
58
|
// fprintf(stderr, "failed to process audio\n");
|
@@ -66,8 +77,53 @@ extern "C" {
|
|
66
77
|
//
|
67
78
|
|
68
79
|
struct whisper_context;
|
80
|
+
struct whisper_state;
|
81
|
+
struct whisper_full_params;
|
82
|
+
|
83
|
+
typedef int32_t whisper_pos;
|
84
|
+
typedef int32_t whisper_token;
|
85
|
+
typedef int32_t whisper_seq_id;
|
86
|
+
|
87
|
+
enum whisper_alignment_heads_preset {
|
88
|
+
WHISPER_AHEADS_NONE,
|
89
|
+
WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
|
90
|
+
WHISPER_AHEADS_CUSTOM,
|
91
|
+
WHISPER_AHEADS_TINY_EN,
|
92
|
+
WHISPER_AHEADS_TINY,
|
93
|
+
WHISPER_AHEADS_BASE_EN,
|
94
|
+
WHISPER_AHEADS_BASE,
|
95
|
+
WHISPER_AHEADS_SMALL_EN,
|
96
|
+
WHISPER_AHEADS_SMALL,
|
97
|
+
WHISPER_AHEADS_MEDIUM_EN,
|
98
|
+
WHISPER_AHEADS_MEDIUM,
|
99
|
+
WHISPER_AHEADS_LARGE_V1,
|
100
|
+
WHISPER_AHEADS_LARGE_V2,
|
101
|
+
WHISPER_AHEADS_LARGE_V3,
|
102
|
+
};
|
103
|
+
|
104
|
+
typedef struct whisper_ahead {
|
105
|
+
int n_text_layer;
|
106
|
+
int n_head;
|
107
|
+
} whisper_ahead;
|
108
|
+
|
109
|
+
typedef struct whisper_aheads {
|
110
|
+
size_t n_heads;
|
111
|
+
const whisper_ahead * heads;
|
112
|
+
} whisper_aheads;
|
113
|
+
|
114
|
+
struct whisper_context_params {
|
115
|
+
bool use_gpu;
|
116
|
+
int gpu_device; // CUDA device
|
117
|
+
|
118
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
119
|
+
bool dtw_token_timestamps;
|
120
|
+
enum whisper_alignment_heads_preset dtw_aheads_preset;
|
69
121
|
|
70
|
-
|
122
|
+
int dtw_n_top;
|
123
|
+
struct whisper_aheads dtw_aheads;
|
124
|
+
|
125
|
+
size_t dtw_mem_size; // TODO: remove
|
126
|
+
};
|
71
127
|
|
72
128
|
typedef struct whisper_token_data {
|
73
129
|
whisper_token id; // token id
|
@@ -83,6 +139,11 @@ extern "C" {
|
|
83
139
|
int64_t t0; // start time of the token
|
84
140
|
int64_t t1; // end time of the token
|
85
141
|
|
142
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
143
|
+
// do not use if you haven't computed token-level timestamps with dtw
|
144
|
+
// Roughly corresponds to the moment in audio in which the token was output
|
145
|
+
int64_t t_dtw;
|
146
|
+
|
86
147
|
float vlen; // voice length of the token
|
87
148
|
} whisper_token_data;
|
88
149
|
|
@@ -94,18 +155,102 @@ extern "C" {
|
|
94
155
|
void (*close)(void * ctx);
|
95
156
|
} whisper_model_loader;
|
96
157
|
|
158
|
+
// grammar element type
|
159
|
+
enum whisper_gretype {
|
160
|
+
// end of rule definition
|
161
|
+
WHISPER_GRETYPE_END = 0,
|
162
|
+
|
163
|
+
// start of alternate definition for rule
|
164
|
+
WHISPER_GRETYPE_ALT = 1,
|
165
|
+
|
166
|
+
// non-terminal element: reference to rule
|
167
|
+
WHISPER_GRETYPE_RULE_REF = 2,
|
168
|
+
|
169
|
+
// terminal element: character (code point)
|
170
|
+
WHISPER_GRETYPE_CHAR = 3,
|
171
|
+
|
172
|
+
// inverse char(s) ([^a], [^a-b] [^abc])
|
173
|
+
WHISPER_GRETYPE_CHAR_NOT = 4,
|
174
|
+
|
175
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
|
176
|
+
// be an inclusive range ([a-z])
|
177
|
+
WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
|
178
|
+
|
179
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or
|
180
|
+
// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
181
|
+
WHISPER_GRETYPE_CHAR_ALT = 6,
|
182
|
+
};
|
183
|
+
|
184
|
+
typedef struct whisper_grammar_element {
|
185
|
+
enum whisper_gretype type;
|
186
|
+
uint32_t value; // Unicode code point or rule ID
|
187
|
+
} whisper_grammar_element;
|
188
|
+
|
97
189
|
// Various functions for loading a ggml whisper model.
|
98
190
|
// Allocate (almost) all memory needed for the model.
|
99
191
|
// Return NULL on failure
|
100
|
-
WHISPER_API struct whisper_context *
|
101
|
-
WHISPER_API struct whisper_context *
|
102
|
-
WHISPER_API struct whisper_context *
|
103
|
-
|
104
|
-
//
|
105
|
-
|
192
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
|
193
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
194
|
+
WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
|
195
|
+
|
196
|
+
// These are the same as the above, but the internal state of the context is not allocated automatically
|
197
|
+
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
|
198
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
|
199
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
200
|
+
WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
|
201
|
+
|
202
|
+
WHISPER_DEPRECATED(
|
203
|
+
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
|
204
|
+
"use whisper_init_from_file_with_params instead"
|
205
|
+
);
|
206
|
+
WHISPER_DEPRECATED(
|
207
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
|
208
|
+
"use whisper_init_from_buffer_with_params instead"
|
209
|
+
);
|
210
|
+
WHISPER_DEPRECATED(
|
211
|
+
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
|
212
|
+
"use whisper_init_with_params instead"
|
213
|
+
);
|
214
|
+
WHISPER_DEPRECATED(
|
215
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
|
216
|
+
"use whisper_init_from_file_with_params_no_state instead"
|
217
|
+
);
|
218
|
+
WHISPER_DEPRECATED(
|
219
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
|
220
|
+
"use whisper_init_from_buffer_with_params_no_state instead"
|
221
|
+
);
|
222
|
+
WHISPER_DEPRECATED(
|
223
|
+
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
|
224
|
+
"use whisper_init_with_params_no_state instead"
|
225
|
+
);
|
226
|
+
|
227
|
+
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
|
228
|
+
|
229
|
+
// Given a context, enable use of OpenVINO for encode inference.
|
230
|
+
// model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
|
231
|
+
// the path will be generated from the ggml model path that was passed
|
232
|
+
// in to whisper_init_from_file. For example, if 'path_model' was
|
233
|
+
// "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
|
234
|
+
// assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
|
235
|
+
// device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
|
236
|
+
// cache_dir: Optional cache directory that can speed up init time, especially for
|
237
|
+
// GPU, by caching compiled 'blobs' there.
|
238
|
+
// Set to nullptr if not used.
|
239
|
+
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
|
240
|
+
WHISPER_API int whisper_ctx_init_openvino_encoder(
|
241
|
+
struct whisper_context * ctx,
|
242
|
+
const char * model_path,
|
243
|
+
const char * device,
|
244
|
+
const char * cache_dir);
|
245
|
+
|
246
|
+
// Frees all allocated memory
|
247
|
+
WHISPER_API void whisper_free (struct whisper_context * ctx);
|
248
|
+
WHISPER_API void whisper_free_state(struct whisper_state * state);
|
249
|
+
WHISPER_API void whisper_free_params(struct whisper_full_params * params);
|
250
|
+
WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
|
106
251
|
|
107
252
|
// Convert RAW PCM audio to log mel spectrogram.
|
108
|
-
// The resulting spectrogram is stored inside the provided whisper context.
|
253
|
+
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
109
254
|
// Returns 0 on success
|
110
255
|
WHISPER_API int whisper_pcm_to_mel(
|
111
256
|
struct whisper_context * ctx,
|
@@ -113,17 +258,30 @@ extern "C" {
|
|
113
258
|
int n_samples,
|
114
259
|
int n_threads);
|
115
260
|
|
116
|
-
|
117
|
-
|
261
|
+
WHISPER_API int whisper_pcm_to_mel_with_state(
|
262
|
+
struct whisper_context * ctx,
|
263
|
+
struct whisper_state * state,
|
264
|
+
const float * samples,
|
265
|
+
int n_samples,
|
266
|
+
int n_threads);
|
267
|
+
|
268
|
+
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
|
269
|
+
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
118
270
|
// Returns 0 on success
|
119
271
|
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
120
|
-
struct whisper_context* ctx,
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
272
|
+
struct whisper_context * ctx,
|
273
|
+
const float * samples,
|
274
|
+
int n_samples,
|
275
|
+
int n_threads);
|
276
|
+
|
277
|
+
WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
|
278
|
+
struct whisper_context * ctx,
|
279
|
+
struct whisper_state * state,
|
280
|
+
const float * samples,
|
281
|
+
int n_samples,
|
282
|
+
int n_threads);
|
283
|
+
|
284
|
+
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
127
285
|
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
128
286
|
// n_mel must be 80
|
129
287
|
// Returns 0 on success
|
@@ -133,7 +291,14 @@ extern "C" {
|
|
133
291
|
int n_len,
|
134
292
|
int n_mel);
|
135
293
|
|
136
|
-
|
294
|
+
WHISPER_API int whisper_set_mel_with_state(
|
295
|
+
struct whisper_context * ctx,
|
296
|
+
struct whisper_state * state,
|
297
|
+
const float * data,
|
298
|
+
int n_len,
|
299
|
+
int n_mel);
|
300
|
+
|
301
|
+
// Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
|
137
302
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
138
303
|
// offset can be used to specify the offset of the first frame in the spectrogram.
|
139
304
|
// Returns 0 on success
|
@@ -142,6 +307,12 @@ extern "C" {
|
|
142
307
|
int offset,
|
143
308
|
int n_threads);
|
144
309
|
|
310
|
+
WHISPER_API int whisper_encode_with_state(
|
311
|
+
struct whisper_context * ctx,
|
312
|
+
struct whisper_state * state,
|
313
|
+
int offset,
|
314
|
+
int n_threads);
|
315
|
+
|
145
316
|
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
146
317
|
// Make sure to call whisper_encode() first.
|
147
318
|
// tokens + n_tokens is the provided context for the decoder.
|
@@ -155,10 +326,18 @@ extern "C" {
|
|
155
326
|
int n_past,
|
156
327
|
int n_threads);
|
157
328
|
|
329
|
+
WHISPER_API int whisper_decode_with_state(
|
330
|
+
struct whisper_context * ctx,
|
331
|
+
struct whisper_state * state,
|
332
|
+
const whisper_token * tokens,
|
333
|
+
int n_tokens,
|
334
|
+
int n_past,
|
335
|
+
int n_threads);
|
336
|
+
|
158
337
|
// Convert the provided text into tokens.
|
159
338
|
// The tokens pointer must be large enough to hold the resulting tokens.
|
160
339
|
// Returns the number of tokens on success, no more than n_max_tokens
|
161
|
-
// Returns
|
340
|
+
// Returns a negative number on failure - its absolute value is the number of tokens that would have been returned
|
162
341
|
// TODO: not sure if correct
|
163
342
|
WHISPER_API int whisper_tokenize(
|
164
343
|
struct whisper_context * ctx,
|
@@ -166,6 +345,10 @@ extern "C" {
|
|
166
345
|
whisper_token * tokens,
|
167
346
|
int n_max_tokens);
|
168
347
|
|
348
|
+
// Return the number of tokens in the provided text
|
349
|
+
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
|
350
|
+
int whisper_token_count(struct whisper_context * ctx, const char * text);
|
351
|
+
|
169
352
|
// Largest language id (i.e. number of available languages - 1)
|
170
353
|
WHISPER_API int whisper_lang_max_id();
|
171
354
|
|
@@ -178,11 +361,14 @@ extern "C" {
|
|
178
361
|
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
|
179
362
|
WHISPER_API const char * whisper_lang_str(int id);
|
180
363
|
|
364
|
+
// Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
|
365
|
+
WHISPER_API const char * whisper_lang_str_full(int id);
|
366
|
+
|
181
367
|
// Use mel data at offset_ms to try and auto-detect the spoken language
|
182
368
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
183
369
|
// Returns the top language id or negative on failure
|
184
370
|
// If not null, fills the lang_probs array with the probabilities of all languages
|
185
|
-
// The array must be
|
371
|
+
// The array must be whisper_lang_max_id() + 1 in size
|
186
372
|
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
187
373
|
WHISPER_API int whisper_lang_auto_detect(
|
188
374
|
struct whisper_context * ctx,
|
@@ -190,80 +376,100 @@ extern "C" {
|
|
190
376
|
int n_threads,
|
191
377
|
float * lang_probs);
|
192
378
|
|
193
|
-
WHISPER_API int
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
379
|
+
WHISPER_API int whisper_lang_auto_detect_with_state(
|
380
|
+
struct whisper_context * ctx,
|
381
|
+
struct whisper_state * state,
|
382
|
+
int offset_ms,
|
383
|
+
int n_threads,
|
384
|
+
float * lang_probs);
|
385
|
+
|
386
|
+
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
|
387
|
+
WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
|
388
|
+
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
|
389
|
+
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
|
390
|
+
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
|
391
|
+
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
|
392
|
+
|
393
|
+
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
|
394
|
+
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
|
395
|
+
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
|
396
|
+
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
|
397
|
+
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
|
398
|
+
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
|
399
|
+
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
|
400
|
+
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
|
401
|
+
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
|
402
|
+
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
|
403
|
+
WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
|
404
|
+
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
|
198
405
|
|
199
406
|
// Token logits obtained from the last call to whisper_decode()
|
200
407
|
// The logits for the last token are stored in the last row
|
201
408
|
// Rows: n_tokens
|
202
409
|
// Cols: n_vocab
|
203
|
-
WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
|
410
|
+
WHISPER_API float * whisper_get_logits (struct whisper_context * ctx);
|
411
|
+
WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
|
204
412
|
|
205
413
|
// Token Id -> String. Uses the vocabulary in the provided context
|
206
414
|
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
|
415
|
+
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
|
416
|
+
|
207
417
|
|
208
418
|
// Special tokens
|
209
419
|
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
210
420
|
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
211
|
-
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
212
421
|
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
422
|
+
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
423
|
+
WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
|
213
424
|
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
214
425
|
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
215
426
|
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
216
427
|
|
217
428
|
// Task tokens
|
218
|
-
WHISPER_API whisper_token whisper_token_translate (
|
219
|
-
WHISPER_API whisper_token whisper_token_transcribe(
|
429
|
+
WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
|
430
|
+
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
220
431
|
|
221
|
-
// Performance information
|
432
|
+
// Performance information from the default state.
|
222
433
|
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
223
434
|
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
224
435
|
|
225
436
|
// Print system information
|
226
437
|
WHISPER_API const char * whisper_print_system_info(void);
|
227
438
|
|
228
|
-
// Abort a running whisper_full_parallel or whisper_full
|
229
|
-
WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
|
230
|
-
|
231
|
-
// Resume whisper context from an aborted state allowing it run again
|
232
|
-
WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
|
233
|
-
|
234
|
-
// Check the whisper context state if true then it can run if false it can not
|
235
|
-
WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
|
236
|
-
|
237
439
|
////////////////////////////////////////////////////////////////////////////
|
238
440
|
|
239
441
|
// Available sampling strategies
|
240
442
|
enum whisper_sampling_strategy {
|
241
|
-
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's
|
443
|
+
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
|
242
444
|
WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
|
243
445
|
};
|
244
446
|
|
245
447
|
// Text segment callback
|
246
448
|
// Called on every newly generated text segment
|
247
449
|
// Use the whisper_full_...() functions to obtain the text segments
|
248
|
-
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
|
450
|
+
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
|
451
|
+
|
452
|
+
// Progress callback
|
453
|
+
typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
|
249
454
|
|
250
455
|
// Encoder begin callback
|
251
456
|
// If not NULL, called before the encoder starts
|
252
457
|
// If it returns false, the computation is aborted
|
253
|
-
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
|
458
|
+
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
|
254
459
|
|
255
460
|
// Logits filter callback
|
256
461
|
// Can be used to modify the logits before sampling
|
257
462
|
// If not NULL, called after applying temperature to logits
|
258
463
|
typedef void (*whisper_logits_filter_callback)(
|
259
464
|
struct whisper_context * ctx,
|
465
|
+
struct whisper_state * state,
|
260
466
|
const whisper_token_data * tokens,
|
261
467
|
int n_tokens,
|
262
468
|
float * logits,
|
263
469
|
void * user_data);
|
264
470
|
|
265
471
|
// Parameters for the whisper_full() function
|
266
|
-
// If you
|
472
|
+
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
267
473
|
// whisper_full_default_params()
|
268
474
|
struct whisper_full_params {
|
269
475
|
enum whisper_sampling_strategy strategy;
|
@@ -275,6 +481,7 @@ extern "C" {
|
|
275
481
|
|
276
482
|
bool translate;
|
277
483
|
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
|
484
|
+
bool no_timestamps; // do not generate timestamps
|
278
485
|
bool single_segment; // force single segment output (useful for streaming)
|
279
486
|
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
|
280
487
|
bool print_progress; // print progress information
|
@@ -292,15 +499,26 @@ extern "C" {
|
|
292
499
|
// [EXPERIMENTAL] speed-up techniques
|
293
500
|
// note: these can significantly reduce the quality of the output
|
294
501
|
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
502
|
+
bool debug_mode; // debug mode: provides extra info (e.g. dumps log_mel)
|
295
503
|
int audio_ctx; // overwrite the audio context size (0 = use default)
|
296
504
|
|
505
|
+
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
506
|
+
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
507
|
+
|
508
|
+
// A regular expression that matches tokens to suppress
|
509
|
+
const char * suppress_regex;
|
510
|
+
|
297
511
|
// tokens to provide to the whisper decoder as initial prompt
|
298
512
|
// these are prepended to any existing text context from a previous call
|
513
|
+
// use whisper_tokenize() to convert text to tokens
|
514
|
+
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
|
515
|
+
const char * initial_prompt;
|
299
516
|
const whisper_token * prompt_tokens;
|
300
517
|
int prompt_n_tokens;
|
301
518
|
|
302
519
|
// for auto-detection, set to nullptr, "" or "auto"
|
303
520
|
const char * language;
|
521
|
+
bool detect_language;
|
304
522
|
|
305
523
|
// common decoding parameters:
|
306
524
|
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
@@ -331,18 +549,36 @@ extern "C" {
|
|
331
549
|
whisper_new_segment_callback new_segment_callback;
|
332
550
|
void * new_segment_callback_user_data;
|
333
551
|
|
552
|
+
// called on each progress update
|
553
|
+
whisper_progress_callback progress_callback;
|
554
|
+
void * progress_callback_user_data;
|
555
|
+
|
334
556
|
// called each time before the encoder starts
|
335
557
|
whisper_encoder_begin_callback encoder_begin_callback;
|
336
558
|
void * encoder_begin_callback_user_data;
|
337
559
|
|
560
|
+
// called each time before ggml computation starts
|
561
|
+
ggml_abort_callback abort_callback;
|
562
|
+
void * abort_callback_user_data;
|
563
|
+
|
338
564
|
// called by each decoder to filter obtained logits
|
339
565
|
whisper_logits_filter_callback logits_filter_callback;
|
340
566
|
void * logits_filter_callback_user_data;
|
567
|
+
|
568
|
+
const whisper_grammar_element ** grammar_rules;
|
569
|
+
size_t n_grammar_rules;
|
570
|
+
size_t i_start_rule;
|
571
|
+
float grammar_penalty;
|
341
572
|
};
|
342
573
|
|
574
|
+
// NOTE: the *_by_ref() functions allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
|
575
|
+
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
|
576
|
+
WHISPER_API struct whisper_context_params whisper_context_default_params(void);
|
577
|
+
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
|
343
578
|
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
|
344
579
|
|
345
580
|
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
581
|
+
// Not thread safe for same context
|
346
582
|
// Uses the specified decoding strategy to obtain the text.
|
347
583
|
WHISPER_API int whisper_full(
|
348
584
|
struct whisper_context * ctx,
|
@@ -350,7 +586,16 @@ extern "C" {
|
|
350
586
|
const float * samples,
|
351
587
|
int n_samples);
|
352
588
|
|
353
|
-
|
589
|
+
WHISPER_API int whisper_full_with_state(
|
590
|
+
struct whisper_context * ctx,
|
591
|
+
struct whisper_state * state,
|
592
|
+
struct whisper_full_params params,
|
593
|
+
const float * samples,
|
594
|
+
int n_samples);
|
595
|
+
|
596
|
+
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
597
|
+
// Result is stored in the default state of the context
|
598
|
+
// Not thread safe if executed in parallel on the same context.
|
354
599
|
// It seems this approach can offer some speedup in some cases.
|
355
600
|
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
356
601
|
WHISPER_API int whisper_full_parallel(
|
@@ -360,40 +605,64 @@ extern "C" {
|
|
360
605
|
int n_samples,
|
361
606
|
int n_processors);
|
362
607
|
|
363
|
-
// Number of generated text segments
|
608
|
+
// Number of generated text segments
|
364
609
|
// A segment can be a few words, a sentence, or even a paragraph.
|
365
|
-
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
|
610
|
+
WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx);
|
611
|
+
WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
|
366
612
|
|
367
|
-
// Language id associated with the
|
613
|
+
// Language id associated with the context's default state
|
368
614
|
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
|
369
615
|
|
370
|
-
//
|
371
|
-
WHISPER_API
|
372
|
-
|
616
|
+
// Language id associated with the provided state
|
617
|
+
WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
|
618
|
+
|
619
|
+
// Get the start and end time of the specified segment
|
620
|
+
WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment);
|
621
|
+
WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
|
622
|
+
|
623
|
+
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
624
|
+
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
373
625
|
|
374
|
-
// Get the
|
375
|
-
WHISPER_API
|
626
|
+
// Get whether the next segment is predicted as a speaker turn
|
627
|
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
|
628
|
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
|
376
629
|
|
377
|
-
// Get
|
378
|
-
WHISPER_API
|
630
|
+
// Get the text of the specified segment
|
631
|
+
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
632
|
+
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
379
633
|
|
380
|
-
// Get
|
381
|
-
WHISPER_API
|
382
|
-
WHISPER_API
|
634
|
+
// Get number of tokens in the specified segment
|
635
|
+
WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
|
636
|
+
WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
|
383
637
|
|
384
|
-
// Get token
|
638
|
+
// Get the token text of the specified token in the specified segment
|
639
|
+
WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
|
640
|
+
WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
|
641
|
+
|
642
|
+
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
|
643
|
+
WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
|
644
|
+
|
645
|
+
// Get token data for the specified token in the specified segment
|
385
646
|
// This contains probabilities, timestamps, etc.
|
386
|
-
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
|
647
|
+
WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
|
648
|
+
WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
|
387
649
|
|
388
|
-
// Get the probability of the specified token in the specified segment
|
389
|
-
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
650
|
+
// Get the probability of the specified token in the specified segment
|
651
|
+
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
|
652
|
+
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
|
390
653
|
|
391
654
|
////////////////////////////////////////////////////////////////////////////
|
392
655
|
|
393
656
|
// Temporary helpers needed for exposing ggml interface
|
394
657
|
|
395
|
-
WHISPER_API int
|
396
|
-
WHISPER_API
|
658
|
+
WHISPER_API int whisper_bench_memcpy (int n_threads);
|
659
|
+
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
|
660
|
+
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
661
|
+
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
662
|
+
|
663
|
+
// Control logging output; default behavior is to print to stderr
|
664
|
+
|
665
|
+
WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
|
397
666
|
|
398
667
|
#ifdef __cplusplus
|
399
668
|
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whispercpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Georgi Gerganov
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2024-05-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: High-performance inference of OpenAI's Whisper automatic speech recognition
|
15
15
|
(ASR) model via Ruby
|
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
55
55
|
- !ruby/object:Gem::Version
|
56
56
|
version: '0'
|
57
57
|
requirements: []
|
58
|
-
rubygems_version: 3.
|
58
|
+
rubygems_version: 3.5.9
|
59
59
|
signing_key:
|
60
60
|
specification_version: 4
|
61
61
|
summary: Ruby whisper.cpp bindings
|