whispercpp 1.2.0.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +4 -19
- data/ext/extconf.rb +9 -0
- data/ext/ggml.c +18380 -5241
- data/ext/ggml.h +2156 -502
- data/ext/ruby_whisper.cpp +2 -2
- data/ext/whisper.cpp +4184 -1774
- data/ext/whisper.h +348 -56
- metadata +3 -3
data/ext/whisper.h
CHANGED
@@ -1,10 +1,20 @@
|
|
1
1
|
#ifndef WHISPER_H
|
2
2
|
#define WHISPER_H
|
3
3
|
|
4
|
+
#include "ggml.h"
|
5
|
+
|
4
6
|
#include <stddef.h>
|
5
7
|
#include <stdint.h>
|
6
8
|
#include <stdbool.h>
|
7
9
|
|
10
|
+
#ifdef __GNUC__
|
11
|
+
# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
12
|
+
#elif defined(_MSC_VER)
|
13
|
+
# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
14
|
+
#else
|
15
|
+
# define WHISPER_DEPRECATED(func, hint) func
|
16
|
+
#endif
|
17
|
+
|
8
18
|
#ifdef WHISPER_SHARED
|
9
19
|
# ifdef _WIN32
|
10
20
|
# ifdef WHISPER_BUILD
|
@@ -21,7 +31,6 @@
|
|
21
31
|
|
22
32
|
#define WHISPER_SAMPLE_RATE 16000
|
23
33
|
#define WHISPER_N_FFT 400
|
24
|
-
#define WHISPER_N_MEL 80
|
25
34
|
#define WHISPER_HOP_LENGTH 160
|
26
35
|
#define WHISPER_CHUNK_SIZE 30
|
27
36
|
|
@@ -41,7 +50,9 @@ extern "C" {
|
|
41
50
|
//
|
42
51
|
// ...
|
43
52
|
//
|
44
|
-
//
|
53
|
+
// whisper_context_params cparams = whisper_context_default_params();
|
54
|
+
//
|
55
|
+
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
|
45
56
|
//
|
46
57
|
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
47
58
|
// fprintf(stderr, "failed to process audio\n");
|
@@ -66,8 +77,53 @@ extern "C" {
|
|
66
77
|
//
|
67
78
|
|
68
79
|
struct whisper_context;
|
80
|
+
struct whisper_state;
|
81
|
+
struct whisper_full_params;
|
82
|
+
|
83
|
+
typedef int32_t whisper_pos;
|
84
|
+
typedef int32_t whisper_token;
|
85
|
+
typedef int32_t whisper_seq_id;
|
86
|
+
|
87
|
+
enum whisper_alignment_heads_preset {
|
88
|
+
WHISPER_AHEADS_NONE,
|
89
|
+
WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
|
90
|
+
WHISPER_AHEADS_CUSTOM,
|
91
|
+
WHISPER_AHEADS_TINY_EN,
|
92
|
+
WHISPER_AHEADS_TINY,
|
93
|
+
WHISPER_AHEADS_BASE_EN,
|
94
|
+
WHISPER_AHEADS_BASE,
|
95
|
+
WHISPER_AHEADS_SMALL_EN,
|
96
|
+
WHISPER_AHEADS_SMALL,
|
97
|
+
WHISPER_AHEADS_MEDIUM_EN,
|
98
|
+
WHISPER_AHEADS_MEDIUM,
|
99
|
+
WHISPER_AHEADS_LARGE_V1,
|
100
|
+
WHISPER_AHEADS_LARGE_V2,
|
101
|
+
WHISPER_AHEADS_LARGE_V3,
|
102
|
+
};
|
103
|
+
|
104
|
+
typedef struct whisper_ahead {
|
105
|
+
int n_text_layer;
|
106
|
+
int n_head;
|
107
|
+
} whisper_ahead;
|
108
|
+
|
109
|
+
typedef struct whisper_aheads {
|
110
|
+
size_t n_heads;
|
111
|
+
const whisper_ahead * heads;
|
112
|
+
} whisper_aheads;
|
113
|
+
|
114
|
+
struct whisper_context_params {
|
115
|
+
bool use_gpu;
|
116
|
+
int gpu_device; // CUDA device
|
117
|
+
|
118
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
119
|
+
bool dtw_token_timestamps;
|
120
|
+
enum whisper_alignment_heads_preset dtw_aheads_preset;
|
69
121
|
|
70
|
-
|
122
|
+
int dtw_n_top;
|
123
|
+
struct whisper_aheads dtw_aheads;
|
124
|
+
|
125
|
+
size_t dtw_mem_size; // TODO: remove
|
126
|
+
};
|
71
127
|
|
72
128
|
typedef struct whisper_token_data {
|
73
129
|
whisper_token id; // token id
|
@@ -83,6 +139,11 @@ extern "C" {
|
|
83
139
|
int64_t t0; // start time of the token
|
84
140
|
int64_t t1; // end time of the token
|
85
141
|
|
142
|
+
// [EXPERIMENTAL] Token-level timestamps with DTW
|
143
|
+
// do not use if you haven't computed token-level timestamps with dtw
|
144
|
+
// Roughly corresponds to the moment in audio in which the token was output
|
145
|
+
int64_t t_dtw;
|
146
|
+
|
86
147
|
float vlen; // voice length of the token
|
87
148
|
} whisper_token_data;
|
88
149
|
|
@@ -94,18 +155,102 @@ extern "C" {
|
|
94
155
|
void (*close)(void * ctx);
|
95
156
|
} whisper_model_loader;
|
96
157
|
|
158
|
+
// grammar element type
|
159
|
+
enum whisper_gretype {
|
160
|
+
// end of rule definition
|
161
|
+
WHISPER_GRETYPE_END = 0,
|
162
|
+
|
163
|
+
// start of alternate definition for rule
|
164
|
+
WHISPER_GRETYPE_ALT = 1,
|
165
|
+
|
166
|
+
// non-terminal element: reference to rule
|
167
|
+
WHISPER_GRETYPE_RULE_REF = 2,
|
168
|
+
|
169
|
+
// terminal element: character (code point)
|
170
|
+
WHISPER_GRETYPE_CHAR = 3,
|
171
|
+
|
172
|
+
// inverse char(s) ([^a], [^a-b], [^abc])
|
173
|
+
WHISPER_GRETYPE_CHAR_NOT = 4,
|
174
|
+
|
175
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
|
176
|
+
// be an inclusive range ([a-z])
|
177
|
+
WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
|
178
|
+
|
179
|
+
// modifies a preceding WHISPER_GRETYPE_CHAR or
|
180
|
+
// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
181
|
+
WHISPER_GRETYPE_CHAR_ALT = 6,
|
182
|
+
};
|
183
|
+
|
184
|
+
typedef struct whisper_grammar_element {
|
185
|
+
enum whisper_gretype type;
|
186
|
+
uint32_t value; // Unicode code point or rule ID
|
187
|
+
} whisper_grammar_element;
|
188
|
+
|
97
189
|
// Various functions for loading a ggml whisper model.
|
98
190
|
// Allocate (almost) all memory needed for the model.
|
99
191
|
// Return NULL on failure
|
100
|
-
WHISPER_API struct whisper_context *
|
101
|
-
WHISPER_API struct whisper_context *
|
102
|
-
WHISPER_API struct whisper_context *
|
103
|
-
|
104
|
-
//
|
105
|
-
|
192
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
|
193
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
194
|
+
WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
|
195
|
+
|
196
|
+
// These are the same as the above, but the internal state of the context is not allocated automatically
|
197
|
+
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
|
198
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
|
199
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
|
200
|
+
WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
|
201
|
+
|
202
|
+
WHISPER_DEPRECATED(
|
203
|
+
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
|
204
|
+
"use whisper_init_from_file_with_params instead"
|
205
|
+
);
|
206
|
+
WHISPER_DEPRECATED(
|
207
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
|
208
|
+
"use whisper_init_from_buffer_with_params instead"
|
209
|
+
);
|
210
|
+
WHISPER_DEPRECATED(
|
211
|
+
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
|
212
|
+
"use whisper_init_with_params instead"
|
213
|
+
);
|
214
|
+
WHISPER_DEPRECATED(
|
215
|
+
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
|
216
|
+
"use whisper_init_from_file_with_params_no_state instead"
|
217
|
+
);
|
218
|
+
WHISPER_DEPRECATED(
|
219
|
+
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
|
220
|
+
"use whisper_init_from_buffer_with_params_no_state instead"
|
221
|
+
);
|
222
|
+
WHISPER_DEPRECATED(
|
223
|
+
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
|
224
|
+
"use whisper_init_with_params_no_state instead"
|
225
|
+
);
|
226
|
+
|
227
|
+
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
|
228
|
+
|
229
|
+
// Given a context, enable use of OpenVINO for encode inference.
|
230
|
+
// model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
|
231
|
+
// the path will be generated from the ggml model path that was passed
|
232
|
+
// in to whisper_init_from_file. For example, if 'path_model' was
|
233
|
+
// "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
|
234
|
+
// assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
|
235
|
+
// device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
|
236
|
+
// cache_dir: Optional cache directory that can speed up init time, especially for
|
237
|
+
// GPU, by caching compiled 'blobs' there.
|
238
|
+
// Set to nullptr if not used.
|
239
|
+
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
|
240
|
+
WHISPER_API int whisper_ctx_init_openvino_encoder(
|
241
|
+
struct whisper_context * ctx,
|
242
|
+
const char * model_path,
|
243
|
+
const char * device,
|
244
|
+
const char * cache_dir);
|
245
|
+
|
246
|
+
// Frees all allocated memory
|
247
|
+
WHISPER_API void whisper_free (struct whisper_context * ctx);
|
248
|
+
WHISPER_API void whisper_free_state(struct whisper_state * state);
|
249
|
+
WHISPER_API void whisper_free_params(struct whisper_full_params * params);
|
250
|
+
WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
|
106
251
|
|
107
252
|
// Convert RAW PCM audio to log mel spectrogram.
|
108
|
-
// The resulting spectrogram is stored inside the provided whisper context.
|
253
|
+
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
109
254
|
// Returns 0 on success
|
110
255
|
WHISPER_API int whisper_pcm_to_mel(
|
111
256
|
struct whisper_context * ctx,
|
@@ -113,17 +258,30 @@ extern "C" {
|
|
113
258
|
int n_samples,
|
114
259
|
int n_threads);
|
115
260
|
|
116
|
-
|
117
|
-
|
261
|
+
WHISPER_API int whisper_pcm_to_mel_with_state(
|
262
|
+
struct whisper_context * ctx,
|
263
|
+
struct whisper_state * state,
|
264
|
+
const float * samples,
|
265
|
+
int n_samples,
|
266
|
+
int n_threads);
|
267
|
+
|
268
|
+
// Convert RAW PCM audio to log mel spectrogram, applying a Phase Vocoder to speed up the audio x2.
|
269
|
+
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
118
270
|
// Returns 0 on success
|
119
271
|
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
120
|
-
struct whisper_context* ctx,
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
272
|
+
struct whisper_context * ctx,
|
273
|
+
const float * samples,
|
274
|
+
int n_samples,
|
275
|
+
int n_threads);
|
276
|
+
|
277
|
+
WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
|
278
|
+
struct whisper_context * ctx,
|
279
|
+
struct whisper_state * state,
|
280
|
+
const float * samples,
|
281
|
+
int n_samples,
|
282
|
+
int n_threads);
|
283
|
+
|
284
|
+
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
127
285
|
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
128
286
|
// n_mel must be 80
|
129
287
|
// Returns 0 on success
|
@@ -133,7 +291,14 @@ extern "C" {
|
|
133
291
|
int n_len,
|
134
292
|
int n_mel);
|
135
293
|
|
136
|
-
|
294
|
+
WHISPER_API int whisper_set_mel_with_state(
|
295
|
+
struct whisper_context * ctx,
|
296
|
+
struct whisper_state * state,
|
297
|
+
const float * data,
|
298
|
+
int n_len,
|
299
|
+
int n_mel);
|
300
|
+
|
301
|
+
// Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
|
137
302
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
138
303
|
// offset can be used to specify the offset of the first frame in the spectrogram.
|
139
304
|
// Returns 0 on success
|
@@ -142,6 +307,12 @@ extern "C" {
|
|
142
307
|
int offset,
|
143
308
|
int n_threads);
|
144
309
|
|
310
|
+
WHISPER_API int whisper_encode_with_state(
|
311
|
+
struct whisper_context * ctx,
|
312
|
+
struct whisper_state * state,
|
313
|
+
int offset,
|
314
|
+
int n_threads);
|
315
|
+
|
145
316
|
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
146
317
|
// Make sure to call whisper_encode() first.
|
147
318
|
// tokens + n_tokens is the provided context for the decoder.
|
@@ -155,10 +326,18 @@ extern "C" {
|
|
155
326
|
int n_past,
|
156
327
|
int n_threads);
|
157
328
|
|
329
|
+
WHISPER_API int whisper_decode_with_state(
|
330
|
+
struct whisper_context * ctx,
|
331
|
+
struct whisper_state * state,
|
332
|
+
const whisper_token * tokens,
|
333
|
+
int n_tokens,
|
334
|
+
int n_past,
|
335
|
+
int n_threads);
|
336
|
+
|
158
337
|
// Convert the provided text into tokens.
|
159
338
|
// The tokens pointer must be large enough to hold the resulting tokens.
|
160
339
|
// Returns the number of tokens on success, no more than n_max_tokens
|
161
|
-
// Returns
|
340
|
+
// Returns a negative number on failure - the number of tokens that would have been returned
|
162
341
|
// TODO: not sure if correct
|
163
342
|
WHISPER_API int whisper_tokenize(
|
164
343
|
struct whisper_context * ctx,
|
@@ -166,6 +345,10 @@ extern "C" {
|
|
166
345
|
whisper_token * tokens,
|
167
346
|
int n_max_tokens);
|
168
347
|
|
348
|
+
// Return the number of tokens in the provided text
|
349
|
+
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
|
350
|
+
int whisper_token_count(struct whisper_context * ctx, const char * text);
|
351
|
+
|
169
352
|
// Largest language id (i.e. number of available languages - 1)
|
170
353
|
WHISPER_API int whisper_lang_max_id();
|
171
354
|
|
@@ -178,11 +361,14 @@ extern "C" {
|
|
178
361
|
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
|
179
362
|
WHISPER_API const char * whisper_lang_str(int id);
|
180
363
|
|
364
|
+
// Return the full language name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
|
365
|
+
WHISPER_API const char * whisper_lang_str_full(int id);
|
366
|
+
|
181
367
|
// Use mel data at offset_ms to try and auto-detect the spoken language
|
182
368
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
183
369
|
// Returns the top language id or negative on failure
|
184
370
|
// If not null, fills the lang_probs array with the probabilities of all languages
|
185
|
-
// The array must be
|
371
|
+
// The array must be whisper_lang_max_id() + 1 in size
|
186
372
|
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
187
373
|
WHISPER_API int whisper_lang_auto_detect(
|
188
374
|
struct whisper_context * ctx,
|
@@ -190,35 +376,60 @@ extern "C" {
|
|
190
376
|
int n_threads,
|
191
377
|
float * lang_probs);
|
192
378
|
|
193
|
-
WHISPER_API int
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
379
|
+
WHISPER_API int whisper_lang_auto_detect_with_state(
|
380
|
+
struct whisper_context * ctx,
|
381
|
+
struct whisper_state * state,
|
382
|
+
int offset_ms,
|
383
|
+
int n_threads,
|
384
|
+
float * lang_probs);
|
385
|
+
|
386
|
+
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
|
387
|
+
WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
|
388
|
+
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
|
389
|
+
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
|
390
|
+
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
|
391
|
+
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
|
392
|
+
|
393
|
+
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
|
394
|
+
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
|
395
|
+
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
|
396
|
+
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
|
397
|
+
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
|
398
|
+
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
|
399
|
+
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
|
400
|
+
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
|
401
|
+
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
|
402
|
+
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
|
403
|
+
WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
|
404
|
+
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
|
198
405
|
|
199
406
|
// Token logits obtained from the last call to whisper_decode()
|
200
407
|
// The logits for the last token are stored in the last row
|
201
408
|
// Rows: n_tokens
|
202
409
|
// Cols: n_vocab
|
203
|
-
WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
|
410
|
+
WHISPER_API float * whisper_get_logits (struct whisper_context * ctx);
|
411
|
+
WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
|
204
412
|
|
205
413
|
// Token Id -> String. Uses the vocabulary in the provided context
|
206
414
|
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
|
415
|
+
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
|
416
|
+
|
207
417
|
|
208
418
|
// Special tokens
|
209
419
|
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
210
420
|
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
211
|
-
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
212
421
|
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
422
|
+
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
423
|
+
WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
|
213
424
|
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
214
425
|
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
215
426
|
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
216
427
|
|
217
428
|
// Task tokens
|
218
|
-
WHISPER_API whisper_token whisper_token_translate (
|
219
|
-
WHISPER_API whisper_token whisper_token_transcribe(
|
429
|
+
WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
|
430
|
+
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
220
431
|
|
221
|
-
// Performance information
|
432
|
+
// Performance information from the default state.
|
222
433
|
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
223
434
|
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
224
435
|
|
@@ -229,22 +440,36 @@ extern "C" {
|
|
229
440
|
|
230
441
|
// Available sampling strategies
|
231
442
|
enum whisper_sampling_strategy {
|
232
|
-
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's
|
443
|
+
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
|
233
444
|
WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
|
234
445
|
};
|
235
446
|
|
236
447
|
// Text segment callback
|
237
448
|
// Called on every newly generated text segment
|
238
449
|
// Use the whisper_full_...() functions to obtain the text segments
|
239
|
-
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
|
450
|
+
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
|
451
|
+
|
452
|
+
// Progress callback
|
453
|
+
typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
|
240
454
|
|
241
455
|
// Encoder begin callback
|
242
456
|
// If not NULL, called before the encoder starts
|
243
457
|
// If it returns false, the computation is aborted
|
244
|
-
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
|
458
|
+
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
|
459
|
+
|
460
|
+
// Logits filter callback
|
461
|
+
// Can be used to modify the logits before sampling
|
462
|
+
// If not NULL, called after applying temperature to logits
|
463
|
+
typedef void (*whisper_logits_filter_callback)(
|
464
|
+
struct whisper_context * ctx,
|
465
|
+
struct whisper_state * state,
|
466
|
+
const whisper_token_data * tokens,
|
467
|
+
int n_tokens,
|
468
|
+
float * logits,
|
469
|
+
void * user_data);
|
245
470
|
|
246
471
|
// Parameters for the whisper_full() function
|
247
|
-
// If you
|
472
|
+
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
248
473
|
// whisper_full_default_params()
|
249
474
|
struct whisper_full_params {
|
250
475
|
enum whisper_sampling_strategy strategy;
|
@@ -256,6 +481,7 @@ extern "C" {
|
|
256
481
|
|
257
482
|
bool translate;
|
258
483
|
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
|
484
|
+
bool no_timestamps; // do not generate timestamps
|
259
485
|
bool single_segment; // force single segment output (useful for streaming)
|
260
486
|
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
|
261
487
|
bool print_progress; // print progress information
|
@@ -273,15 +499,26 @@ extern "C" {
|
|
273
499
|
// [EXPERIMENTAL] speed-up techniques
|
274
500
|
// note: these can significantly reduce the quality of the output
|
275
501
|
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
502
|
+
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
276
503
|
int audio_ctx; // overwrite the audio context size (0 = use default)
|
277
504
|
|
505
|
+
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
506
|
+
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
507
|
+
|
508
|
+
// A regular expression that matches tokens to suppress
|
509
|
+
const char * suppress_regex;
|
510
|
+
|
278
511
|
// tokens to provide to the whisper decoder as initial prompt
|
279
512
|
// these are prepended to any existing text context from a previous call
|
513
|
+
// use whisper_tokenize() to convert text to tokens
|
514
|
+
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
|
515
|
+
const char * initial_prompt;
|
280
516
|
const whisper_token * prompt_tokens;
|
281
517
|
int prompt_n_tokens;
|
282
518
|
|
283
519
|
// for auto-detection, set to nullptr, "" or "auto"
|
284
520
|
const char * language;
|
521
|
+
bool detect_language;
|
285
522
|
|
286
523
|
// common decoding parameters:
|
287
524
|
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
@@ -312,14 +549,36 @@ extern "C" {
|
|
312
549
|
whisper_new_segment_callback new_segment_callback;
|
313
550
|
void * new_segment_callback_user_data;
|
314
551
|
|
552
|
+
// called on each progress update
|
553
|
+
whisper_progress_callback progress_callback;
|
554
|
+
void * progress_callback_user_data;
|
555
|
+
|
315
556
|
// called each time before the encoder starts
|
316
557
|
whisper_encoder_begin_callback encoder_begin_callback;
|
317
558
|
void * encoder_begin_callback_user_data;
|
559
|
+
|
560
|
+
// called each time before ggml computation starts
|
561
|
+
ggml_abort_callback abort_callback;
|
562
|
+
void * abort_callback_user_data;
|
563
|
+
|
564
|
+
// called by each decoder to filter obtained logits
|
565
|
+
whisper_logits_filter_callback logits_filter_callback;
|
566
|
+
void * logits_filter_callback_user_data;
|
567
|
+
|
568
|
+
const whisper_grammar_element ** grammar_rules;
|
569
|
+
size_t n_grammar_rules;
|
570
|
+
size_t i_start_rule;
|
571
|
+
float grammar_penalty;
|
318
572
|
};
|
319
573
|
|
574
|
+
// NOTE: the *_by_ref() variants below allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
|
575
|
+
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
|
576
|
+
WHISPER_API struct whisper_context_params whisper_context_default_params(void);
|
577
|
+
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
|
320
578
|
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
|
321
579
|
|
322
580
|
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
581
|
+
// Not thread safe for same context
|
323
582
|
// Uses the specified decoding strategy to obtain the text.
|
324
583
|
WHISPER_API int whisper_full(
|
325
584
|
struct whisper_context * ctx,
|
@@ -327,7 +586,16 @@ extern "C" {
|
|
327
586
|
const float * samples,
|
328
587
|
int n_samples);
|
329
588
|
|
330
|
-
|
589
|
+
WHISPER_API int whisper_full_with_state(
|
590
|
+
struct whisper_context * ctx,
|
591
|
+
struct whisper_state * state,
|
592
|
+
struct whisper_full_params params,
|
593
|
+
const float * samples,
|
594
|
+
int n_samples);
|
595
|
+
|
596
|
+
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
597
|
+
// Result is stored in the default state of the context
|
598
|
+
// Not thread safe if executed in parallel on the same context.
|
331
599
|
// It seems this approach can offer some speedup in some cases.
|
332
600
|
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
333
601
|
WHISPER_API int whisper_full_parallel(
|
@@ -337,40 +605,64 @@ extern "C" {
|
|
337
605
|
int n_samples,
|
338
606
|
int n_processors);
|
339
607
|
|
340
|
-
// Number of generated text segments
|
608
|
+
// Number of generated text segments
|
341
609
|
// A segment can be a few words, a sentence, or even a paragraph.
|
342
|
-
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
|
610
|
+
WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx);
|
611
|
+
WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
|
343
612
|
|
344
|
-
// Language id associated with the
|
613
|
+
// Language id associated with the context's default state
|
345
614
|
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
|
346
615
|
|
347
|
-
//
|
348
|
-
WHISPER_API
|
349
|
-
|
616
|
+
// Language id associated with the provided state
|
617
|
+
WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
|
618
|
+
|
619
|
+
// Get the start and end time of the specified segment
|
620
|
+
WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment);
|
621
|
+
WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
|
622
|
+
|
623
|
+
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
624
|
+
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
350
625
|
|
351
|
-
// Get the
|
352
|
-
WHISPER_API
|
626
|
+
// Get whether the next segment is predicted as a speaker turn
|
627
|
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
|
628
|
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
|
353
629
|
|
354
|
-
// Get
|
355
|
-
WHISPER_API
|
630
|
+
// Get the text of the specified segment
|
631
|
+
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
632
|
+
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
356
633
|
|
357
|
-
// Get
|
358
|
-
WHISPER_API
|
359
|
-
WHISPER_API
|
634
|
+
// Get number of tokens in the specified segment
|
635
|
+
WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
|
636
|
+
WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
|
360
637
|
|
361
|
-
// Get token
|
638
|
+
// Get the token text of the specified token in the specified segment
|
639
|
+
WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
|
640
|
+
WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
|
641
|
+
|
642
|
+
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
|
643
|
+
WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
|
644
|
+
|
645
|
+
// Get token data for the specified token in the specified segment
|
362
646
|
// This contains probabilities, timestamps, etc.
|
363
|
-
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
|
647
|
+
WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
|
648
|
+
WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
|
364
649
|
|
365
|
-
// Get the probability of the specified token in the specified segment
|
366
|
-
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
650
|
+
// Get the probability of the specified token in the specified segment
|
651
|
+
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
|
652
|
+
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
|
367
653
|
|
368
654
|
////////////////////////////////////////////////////////////////////////////
|
369
655
|
|
370
656
|
// Temporary helpers needed for exposing ggml interface
|
371
657
|
|
372
|
-
WHISPER_API int
|
373
|
-
WHISPER_API
|
658
|
+
WHISPER_API int whisper_bench_memcpy (int n_threads);
|
659
|
+
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
|
660
|
+
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
661
|
+
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
662
|
+
|
663
|
+
// Control logging output; default behavior is to print to stderr
|
664
|
+
|
665
|
+
WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
|
374
666
|
|
375
667
|
#ifdef __cplusplus
|
376
668
|
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whispercpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Georgi Gerganov
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2024-05-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: High-performance inference of OpenAI's Whisper automatic speech recognition
|
15
15
|
(ASR) model via Ruby
|
@@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
55
55
|
- !ruby/object:Gem::Version
|
56
56
|
version: '0'
|
57
57
|
requirements: []
|
58
|
-
rubygems_version: 3.
|
58
|
+
rubygems_version: 3.5.9
|
59
59
|
signing_key:
|
60
60
|
specification_version: 4
|
61
61
|
summary: Ruby whisper.cpp bindings
|