whispercpp 1.2.0.2 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/LICENSE +1 -1
- data/README.md +165 -434
- data/Rakefile +46 -86
- data/ext/.gitignore +13 -0
- data/ext/cpu.mk +9 -0
- data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
- data/ext/extconf.rb +185 -7
- data/ext/ggml/include/ggml-alloc.h +76 -0
- data/ext/ggml/include/ggml-backend.h +352 -0
- data/ext/ggml/include/ggml-blas.h +25 -0
- data/ext/ggml/include/ggml-cann.h +123 -0
- data/ext/ggml/include/ggml-cpp.h +38 -0
- data/ext/ggml/include/ggml-cpu.h +135 -0
- data/ext/ggml/include/ggml-cuda.h +47 -0
- data/ext/ggml/include/ggml-kompute.h +50 -0
- data/ext/ggml/include/ggml-metal.h +66 -0
- data/ext/ggml/include/ggml-opencl.h +26 -0
- data/ext/ggml/include/ggml-opt.h +216 -0
- data/ext/ggml/include/ggml-rpc.h +28 -0
- data/ext/ggml/include/ggml-sycl.h +49 -0
- data/ext/ggml/include/ggml-vulkan.h +31 -0
- data/ext/ggml/include/ggml.h +2285 -0
- data/ext/ggml/src/ggml-alloc.c +1037 -0
- data/ext/ggml/src/ggml-amx/common.h +94 -0
- data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
- data/ext/ggml/src/ggml-amx/mmq.h +17 -0
- data/ext/ggml/src/ggml-backend-impl.h +256 -0
- data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
- data/ext/ggml/src/ggml-backend.cpp +1999 -0
- data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
- data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
- data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- data/ext/ggml/src/ggml-cann/common.h +286 -0
- data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
- data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
- data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
- data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
- data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
- data/ext/ggml/src/ggml-common.h +1853 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
- data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
- data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- data/ext/ggml/src/ggml-impl.h +556 -0
- data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
- data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
- data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
- data/ext/ggml/src/ggml-opt.cpp +854 -0
- data/ext/ggml/src/ggml-quants.c +5238 -0
- data/ext/ggml/src/ggml-quants.h +100 -0
- data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
- data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
- data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
- data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
- data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
- data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
- data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
- data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
- data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
- data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
- data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
- data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
- data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
- data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- data/ext/ggml/src/ggml-threading.cpp +12 -0
- data/ext/ggml/src/ggml-threading.h +14 -0
- data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
- data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- data/ext/ggml/src/ggml.c +7694 -0
- data/ext/include/whisper.h +672 -0
- data/ext/metal-embed.mk +17 -0
- data/ext/metal.mk +6 -0
- data/ext/ruby_whisper.cpp +1608 -159
- data/ext/ruby_whisper.h +10 -0
- data/ext/scripts/get-flags.mk +38 -0
- data/ext/src/coreml/whisper-decoder-impl.h +146 -0
- data/ext/src/coreml/whisper-decoder-impl.m +201 -0
- data/ext/src/coreml/whisper-encoder-impl.h +142 -0
- data/ext/src/coreml/whisper-encoder-impl.m +197 -0
- data/ext/src/coreml/whisper-encoder.h +26 -0
- data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
- data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
- data/ext/src/whisper.cpp +7393 -0
- data/extsources.rb +6 -0
- data/lib/whisper/model/uri.rb +157 -0
- data/lib/whisper.rb +2 -0
- data/tests/helper.rb +7 -0
- data/tests/jfk_reader/.gitignore +5 -0
- data/tests/jfk_reader/extconf.rb +3 -0
- data/tests/jfk_reader/jfk_reader.c +68 -0
- data/tests/test_callback.rb +160 -0
- data/tests/test_error.rb +20 -0
- data/tests/test_model.rb +71 -0
- data/tests/test_package.rb +31 -0
- data/tests/test_params.rb +160 -0
- data/tests/test_segment.rb +83 -0
- data/tests/test_whisper.rb +211 -123
- data/whispercpp.gemspec +36 -0
- metadata +137 -11
- data/ext/ggml.c +0 -8616
- data/ext/ggml.h +0 -748
- data/ext/whisper.cpp +0 -4829
- data/ext/whisper.h +0 -402
data/ext/whisper.h
DELETED
@@ -1,402 +0,0 @@
|
|
1
|
-
#ifndef WHISPER_H
|
2
|
-
#define WHISPER_H
|
3
|
-
|
4
|
-
#include <stddef.h>
|
5
|
-
#include <stdint.h>
|
6
|
-
#include <stdbool.h>
|
7
|
-
|
8
|
-
#ifdef WHISPER_SHARED
|
9
|
-
# ifdef _WIN32
|
10
|
-
# ifdef WHISPER_BUILD
|
11
|
-
# define WHISPER_API __declspec(dllexport)
|
12
|
-
# else
|
13
|
-
# define WHISPER_API __declspec(dllimport)
|
14
|
-
# endif
|
15
|
-
# else
|
16
|
-
# define WHISPER_API __attribute__ ((visibility ("default")))
|
17
|
-
# endif
|
18
|
-
#else
|
19
|
-
# define WHISPER_API
|
20
|
-
#endif
|
21
|
-
|
22
|
-
#define WHISPER_SAMPLE_RATE 16000
|
23
|
-
#define WHISPER_N_FFT 400
|
24
|
-
#define WHISPER_N_MEL 80
|
25
|
-
#define WHISPER_HOP_LENGTH 160
|
26
|
-
#define WHISPER_CHUNK_SIZE 30
|
27
|
-
|
28
|
-
#ifdef __cplusplus
|
29
|
-
extern "C" {
|
30
|
-
#endif
|
31
|
-
|
32
|
-
//
|
33
|
-
// C interface
|
34
|
-
//
|
35
|
-
// The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
|
36
|
-
// concurrently.
|
37
|
-
//
|
38
|
-
// Basic usage:
|
39
|
-
//
|
40
|
-
// #include "whisper.h"
|
41
|
-
//
|
42
|
-
// ...
|
43
|
-
//
|
44
|
-
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
|
45
|
-
//
|
46
|
-
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
47
|
-
// fprintf(stderr, "failed to process audio\n");
|
48
|
-
// return 7;
|
49
|
-
// }
|
50
|
-
//
|
51
|
-
// const int n_segments = whisper_full_n_segments(ctx);
|
52
|
-
// for (int i = 0; i < n_segments; ++i) {
|
53
|
-
// const char * text = whisper_full_get_segment_text(ctx, i);
|
54
|
-
// printf("%s", text);
|
55
|
-
// }
|
56
|
-
//
|
57
|
-
// whisper_free(ctx);
|
58
|
-
//
|
59
|
-
// ...
|
60
|
-
//
|
61
|
-
// This is a demonstration of the most straightforward usage of the library.
|
62
|
-
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
|
63
|
-
//
|
64
|
-
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
|
65
|
-
// understanding of how the model works.
|
66
|
-
//
|
67
|
-
|
68
|
-
struct whisper_context;
|
69
|
-
|
70
|
-
typedef int whisper_token;
|
71
|
-
|
72
|
-
typedef struct whisper_token_data {
|
73
|
-
whisper_token id; // token id
|
74
|
-
whisper_token tid; // forced timestamp token id
|
75
|
-
|
76
|
-
float p; // probability of the token
|
77
|
-
float plog; // log probability of the token
|
78
|
-
float pt; // probability of the timestamp token
|
79
|
-
float ptsum; // sum of probabilities of all timestamp tokens
|
80
|
-
|
81
|
-
// token-level timestamp data
|
82
|
-
// do not use if you haven't computed token-level timestamps
|
83
|
-
int64_t t0; // start time of the token
|
84
|
-
int64_t t1; // end time of the token
|
85
|
-
|
86
|
-
float vlen; // voice length of the token
|
87
|
-
} whisper_token_data;
|
88
|
-
|
89
|
-
typedef struct whisper_model_loader {
|
90
|
-
void * context;
|
91
|
-
|
92
|
-
size_t (*read)(void * ctx, void * output, size_t read_size);
|
93
|
-
bool (*eof)(void * ctx);
|
94
|
-
void (*close)(void * ctx);
|
95
|
-
} whisper_model_loader;
|
96
|
-
|
97
|
-
// Various functions for loading a ggml whisper model.
|
98
|
-
// Allocate (almost) all memory needed for the model.
|
99
|
-
// Return NULL on failure
|
100
|
-
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
|
101
|
-
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
|
102
|
-
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
|
103
|
-
|
104
|
-
// Frees all memory allocated by the model.
|
105
|
-
WHISPER_API void whisper_free(struct whisper_context * ctx);
|
106
|
-
|
107
|
-
// Convert RAW PCM audio to log mel spectrogram.
|
108
|
-
// The resulting spectrogram is stored inside the provided whisper context.
|
109
|
-
// Returns 0 on success
|
110
|
-
WHISPER_API int whisper_pcm_to_mel(
|
111
|
-
struct whisper_context * ctx,
|
112
|
-
const float * samples,
|
113
|
-
int n_samples,
|
114
|
-
int n_threads);
|
115
|
-
|
116
|
-
// Convert RAW PCM audio to log mel spectrogram while applying a Phase Vocoder to speed up the audio x2.
|
117
|
-
// The resulting spectrogram is stored inside the provided whisper context.
|
118
|
-
// Returns 0 on success
|
119
|
-
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
120
|
-
struct whisper_context* ctx,
|
121
|
-
const float* samples,
|
122
|
-
int n_samples,
|
123
|
-
int n_threads);
|
124
|
-
|
125
|
-
|
126
|
-
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
|
127
|
-
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
128
|
-
// n_mel must be 80
|
129
|
-
// Returns 0 on success
|
130
|
-
WHISPER_API int whisper_set_mel(
|
131
|
-
struct whisper_context * ctx,
|
132
|
-
const float * data,
|
133
|
-
int n_len,
|
134
|
-
int n_mel);
|
135
|
-
|
136
|
-
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
|
137
|
-
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
138
|
-
// offset can be used to specify the offset of the first frame in the spectrogram.
|
139
|
-
// Returns 0 on success
|
140
|
-
WHISPER_API int whisper_encode(
|
141
|
-
struct whisper_context * ctx,
|
142
|
-
int offset,
|
143
|
-
int n_threads);
|
144
|
-
|
145
|
-
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
146
|
-
// Make sure to call whisper_encode() first.
|
147
|
-
// tokens + n_tokens is the provided context for the decoder.
|
148
|
-
// n_past is the number of tokens to use from previous decoder calls.
|
149
|
-
// Returns 0 on success
|
150
|
-
// TODO: add support for multiple decoders
|
151
|
-
WHISPER_API int whisper_decode(
|
152
|
-
struct whisper_context * ctx,
|
153
|
-
const whisper_token * tokens,
|
154
|
-
int n_tokens,
|
155
|
-
int n_past,
|
156
|
-
int n_threads);
|
157
|
-
|
158
|
-
// Convert the provided text into tokens.
|
159
|
-
// The tokens pointer must be large enough to hold the resulting tokens.
|
160
|
-
// Returns the number of tokens on success, no more than n_max_tokens
|
161
|
-
// Returns -1 on failure
|
162
|
-
// TODO: not sure if correct
|
163
|
-
WHISPER_API int whisper_tokenize(
|
164
|
-
struct whisper_context * ctx,
|
165
|
-
const char * text,
|
166
|
-
whisper_token * tokens,
|
167
|
-
int n_max_tokens);
|
168
|
-
|
169
|
-
// Largest language id (i.e. number of available languages - 1)
|
170
|
-
WHISPER_API int whisper_lang_max_id();
|
171
|
-
|
172
|
-
// Return the id of the specified language, returns -1 if not found
|
173
|
-
// Examples:
|
174
|
-
// "de" -> 2
|
175
|
-
// "german" -> 2
|
176
|
-
WHISPER_API int whisper_lang_id(const char * lang);
|
177
|
-
|
178
|
-
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
|
179
|
-
WHISPER_API const char * whisper_lang_str(int id);
|
180
|
-
|
181
|
-
// Use mel data at offset_ms to try and auto-detect the spoken language
|
182
|
-
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
183
|
-
// Returns the top language id or negative on failure
|
184
|
-
// If not null, fills the lang_probs array with the probabilities of all languages
|
185
|
-
// The array must be whisper_lang_max_id() + 1 in size
|
186
|
-
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
187
|
-
WHISPER_API int whisper_lang_auto_detect(
|
188
|
-
struct whisper_context * ctx,
|
189
|
-
int offset_ms,
|
190
|
-
int n_threads,
|
191
|
-
float * lang_probs);
|
192
|
-
|
193
|
-
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
|
194
|
-
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
|
195
|
-
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
|
196
|
-
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
|
197
|
-
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
|
198
|
-
|
199
|
-
// Token logits obtained from the last call to whisper_decode()
|
200
|
-
// The logits for the last token are stored in the last row
|
201
|
-
// Rows: n_tokens
|
202
|
-
// Cols: n_vocab
|
203
|
-
WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
|
204
|
-
|
205
|
-
// Token Id -> String. Uses the vocabulary in the provided context
|
206
|
-
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
|
207
|
-
|
208
|
-
// Special tokens
|
209
|
-
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
210
|
-
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
211
|
-
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
212
|
-
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
213
|
-
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
214
|
-
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
215
|
-
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
216
|
-
|
217
|
-
// Task tokens
|
218
|
-
WHISPER_API whisper_token whisper_token_translate (void);
|
219
|
-
WHISPER_API whisper_token whisper_token_transcribe(void);
|
220
|
-
|
221
|
-
// Performance information
|
222
|
-
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
223
|
-
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
224
|
-
|
225
|
-
// Print system information
|
226
|
-
WHISPER_API const char * whisper_print_system_info(void);
|
227
|
-
|
228
|
-
// Abort a running whisper_full_parallel or whisper_full
|
229
|
-
WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
|
230
|
-
|
231
|
-
// Resume whisper context from an aborted state allowing it to run again
|
232
|
-
WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
|
233
|
-
|
234
|
-
// Check the whisper context state: if true, it can run; if false, it cannot
|
235
|
-
WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
|
236
|
-
|
237
|
-
////////////////////////////////////////////////////////////////////////////
|
238
|
-
|
239
|
-
// Available sampling strategies
|
240
|
-
enum whisper_sampling_strategy {
|
241
|
-
WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
|
242
|
-
WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
|
243
|
-
};
|
244
|
-
|
245
|
-
// Text segment callback
|
246
|
-
// Called on every newly generated text segment
|
247
|
-
// Use the whisper_full_...() functions to obtain the text segments
|
248
|
-
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
|
249
|
-
|
250
|
-
// Encoder begin callback
|
251
|
-
// If not NULL, called before the encoder starts
|
252
|
-
// If it returns false, the computation is aborted
|
253
|
-
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
|
254
|
-
|
255
|
-
// Logits filter callback
|
256
|
-
// Can be used to modify the logits before sampling
|
257
|
-
// If not NULL, called after applying temperature to logits
|
258
|
-
typedef void (*whisper_logits_filter_callback)(
|
259
|
-
struct whisper_context * ctx,
|
260
|
-
const whisper_token_data * tokens,
|
261
|
-
int n_tokens,
|
262
|
-
float * logits,
|
263
|
-
void * user_data);
|
264
|
-
|
265
|
-
// Parameters for the whisper_full() function
|
266
|
-
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
267
|
-
// whisper_full_default_params()
|
268
|
-
struct whisper_full_params {
|
269
|
-
enum whisper_sampling_strategy strategy;
|
270
|
-
|
271
|
-
int n_threads;
|
272
|
-
int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
|
273
|
-
int offset_ms; // start offset in ms
|
274
|
-
int duration_ms; // audio duration to process in ms
|
275
|
-
|
276
|
-
bool translate;
|
277
|
-
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
|
278
|
-
bool single_segment; // force single segment output (useful for streaming)
|
279
|
-
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
|
280
|
-
bool print_progress; // print progress information
|
281
|
-
bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
|
282
|
-
bool print_timestamps; // print timestamps for each text segment when printing realtime
|
283
|
-
|
284
|
-
// [EXPERIMENTAL] token-level timestamps
|
285
|
-
bool token_timestamps; // enable token-level timestamps
|
286
|
-
float thold_pt; // timestamp token probability threshold (~0.01)
|
287
|
-
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
288
|
-
int max_len; // max segment length in characters
|
289
|
-
bool split_on_word; // split on word rather than on token (when used with max_len)
|
290
|
-
int max_tokens; // max tokens per segment (0 = no limit)
|
291
|
-
|
292
|
-
// [EXPERIMENTAL] speed-up techniques
|
293
|
-
// note: these can significantly reduce the quality of the output
|
294
|
-
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
295
|
-
int audio_ctx; // overwrite the audio context size (0 = use default)
|
296
|
-
|
297
|
-
// tokens to provide to the whisper decoder as initial prompt
|
298
|
-
// these are prepended to any existing text context from a previous call
|
299
|
-
const whisper_token * prompt_tokens;
|
300
|
-
int prompt_n_tokens;
|
301
|
-
|
302
|
-
// for auto-detection, set to nullptr, "" or "auto"
|
303
|
-
const char * language;
|
304
|
-
|
305
|
-
// common decoding parameters:
|
306
|
-
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
307
|
-
bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
308
|
-
|
309
|
-
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
|
310
|
-
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
311
|
-
float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
|
312
|
-
|
313
|
-
// fallback parameters
|
314
|
-
// ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
|
315
|
-
float temperature_inc;
|
316
|
-
float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
|
317
|
-
float logprob_thold;
|
318
|
-
float no_speech_thold; // TODO: not implemented
|
319
|
-
|
320
|
-
struct {
|
321
|
-
int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
|
322
|
-
} greedy;
|
323
|
-
|
324
|
-
struct {
|
325
|
-
int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
|
326
|
-
|
327
|
-
float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
|
328
|
-
} beam_search;
|
329
|
-
|
330
|
-
// called for every newly generated text segment
|
331
|
-
whisper_new_segment_callback new_segment_callback;
|
332
|
-
void * new_segment_callback_user_data;
|
333
|
-
|
334
|
-
// called each time before the encoder starts
|
335
|
-
whisper_encoder_begin_callback encoder_begin_callback;
|
336
|
-
void * encoder_begin_callback_user_data;
|
337
|
-
|
338
|
-
// called by each decoder to filter obtained logits
|
339
|
-
whisper_logits_filter_callback logits_filter_callback;
|
340
|
-
void * logits_filter_callback_user_data;
|
341
|
-
};
|
342
|
-
|
343
|
-
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
|
344
|
-
|
345
|
-
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
346
|
-
// Uses the specified decoding strategy to obtain the text.
|
347
|
-
WHISPER_API int whisper_full(
|
348
|
-
struct whisper_context * ctx,
|
349
|
-
struct whisper_full_params params,
|
350
|
-
const float * samples,
|
351
|
-
int n_samples);
|
352
|
-
|
353
|
-
// Split the input audio in chunks and process each chunk separately using whisper_full()
|
354
|
-
// It seems this approach can offer some speedup in some cases.
|
355
|
-
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
356
|
-
WHISPER_API int whisper_full_parallel(
|
357
|
-
struct whisper_context * ctx,
|
358
|
-
struct whisper_full_params params,
|
359
|
-
const float * samples,
|
360
|
-
int n_samples,
|
361
|
-
int n_processors);
|
362
|
-
|
363
|
-
// Number of generated text segments.
|
364
|
-
// A segment can be a few words, a sentence, or even a paragraph.
|
365
|
-
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
|
366
|
-
|
367
|
-
// Language id associated with the current context
|
368
|
-
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
|
369
|
-
|
370
|
-
// Get the start and end time of the specified segment.
|
371
|
-
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
|
372
|
-
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
|
373
|
-
|
374
|
-
// Get the text of the specified segment.
|
375
|
-
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
|
376
|
-
|
377
|
-
// Get number of tokens in the specified segment.
|
378
|
-
WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
|
379
|
-
|
380
|
-
// Get the token text of the specified token in the specified segment.
|
381
|
-
WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
|
382
|
-
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
|
383
|
-
|
384
|
-
// Get token data for the specified token in the specified segment.
|
385
|
-
// This contains probabilities, timestamps, etc.
|
386
|
-
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
|
387
|
-
|
388
|
-
// Get the probability of the specified token in the specified segment.
|
389
|
-
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
390
|
-
|
391
|
-
////////////////////////////////////////////////////////////////////////////
|
392
|
-
|
393
|
-
// Temporary helpers needed for exposing ggml interface
|
394
|
-
|
395
|
-
WHISPER_API int whisper_bench_memcpy(int n_threads);
|
396
|
-
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
|
397
|
-
|
398
|
-
#ifdef __cplusplus
|
399
|
-
}
|
400
|
-
#endif
|
401
|
-
|
402
|
-
#endif
|