whispercpp 1.2.0.2 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
@@ -0,0 +1,672 @@
1
+ #ifndef WHISPER_H
2
+ #define WHISPER_H
3
+
4
+ #include "ggml.h"
5
+ #include "ggml-cpu.h"
6
+
7
+ #include <stddef.h>
8
+ #include <stdint.h>
9
+ #include <stdbool.h>
10
+
11
+ #ifdef __GNUC__
12
+ # define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
13
+ #elif defined(_MSC_VER)
14
+ # define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
15
+ #else
16
+ # define WHISPER_DEPRECATED(func, hint) func
17
+ #endif
18
+
19
+ #ifdef WHISPER_SHARED
20
+ # ifdef _WIN32
21
+ # ifdef WHISPER_BUILD
22
+ # define WHISPER_API __declspec(dllexport)
23
+ # else
24
+ # define WHISPER_API __declspec(dllimport)
25
+ # endif
26
+ # else
27
+ # define WHISPER_API __attribute__ ((visibility ("default")))
28
+ # endif
29
+ #else
30
+ # define WHISPER_API
31
+ #endif
32
+
33
+ #define WHISPER_SAMPLE_RATE 16000
34
+ #define WHISPER_N_FFT 400
35
+ #define WHISPER_HOP_LENGTH 160
36
+ #define WHISPER_CHUNK_SIZE 30
37
+
38
+ #ifdef __cplusplus
39
+ extern "C" {
40
+ #endif
41
+
42
+ //
43
+ // C interface
44
+ //
45
+ // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
46
+ // concurrently.
47
+ //
48
+ // Basic usage:
49
+ //
50
+ // #include "whisper.h"
51
+ //
52
+ // ...
53
+ //
54
+ // whisper_context_params cparams = whisper_context_default_params();
55
+ //
56
+ // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
57
+ //
58
+ // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
59
+ // fprintf(stderr, "failed to process audio\n");
60
+ // return 7;
61
+ // }
62
+ //
63
+ // const int n_segments = whisper_full_n_segments(ctx);
64
+ // for (int i = 0; i < n_segments; ++i) {
65
+ // const char * text = whisper_full_get_segment_text(ctx, i);
66
+ // printf("%s", text);
67
+ // }
68
+ //
69
+ // whisper_free(ctx);
70
+ //
71
+ // ...
72
+ //
73
+ // This is a demonstration of the most straightforward usage of the library.
74
+ // "pcmf32" contains the RAW audio data in 32-bit floating point format.
75
+ //
76
+ // The interface also allows for more fine-grained control over the computation, but it requires a deeper
77
+ // understanding of how the model works.
78
+ //
79
+
80
+ struct whisper_context;
81
+ struct whisper_state;
82
+ struct whisper_full_params;
83
+
84
+ typedef int32_t whisper_pos;
85
+ typedef int32_t whisper_token;
86
+ typedef int32_t whisper_seq_id;
87
+
88
+ enum whisper_alignment_heads_preset {
89
+ WHISPER_AHEADS_NONE,
90
+ WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers
91
+ WHISPER_AHEADS_CUSTOM,
92
+ WHISPER_AHEADS_TINY_EN,
93
+ WHISPER_AHEADS_TINY,
94
+ WHISPER_AHEADS_BASE_EN,
95
+ WHISPER_AHEADS_BASE,
96
+ WHISPER_AHEADS_SMALL_EN,
97
+ WHISPER_AHEADS_SMALL,
98
+ WHISPER_AHEADS_MEDIUM_EN,
99
+ WHISPER_AHEADS_MEDIUM,
100
+ WHISPER_AHEADS_LARGE_V1,
101
+ WHISPER_AHEADS_LARGE_V2,
102
+ WHISPER_AHEADS_LARGE_V3,
103
+ WHISPER_AHEADS_LARGE_V3_TURBO,
104
+ };
105
+
106
+ typedef struct whisper_ahead {
107
+ int n_text_layer;
108
+ int n_head;
109
+ } whisper_ahead;
110
+
111
+ typedef struct whisper_aheads {
112
+ size_t n_heads;
113
+ const whisper_ahead * heads;
114
+ } whisper_aheads;
115
+
116
+ struct whisper_context_params {
117
+ bool use_gpu;
118
+ bool flash_attn;
119
+ int gpu_device; // CUDA device
120
+
121
+ // [EXPERIMENTAL] Token-level timestamps with DTW
122
+ bool dtw_token_timestamps;
123
+ enum whisper_alignment_heads_preset dtw_aheads_preset;
124
+
125
+ int dtw_n_top;
126
+ struct whisper_aheads dtw_aheads;
127
+
128
+ size_t dtw_mem_size; // TODO: remove
129
+ };
130
+
131
+ typedef struct whisper_token_data {
132
+ whisper_token id; // token id
133
+ whisper_token tid; // forced timestamp token id
134
+
135
+ float p; // probability of the token
136
+ float plog; // log probability of the token
137
+ float pt; // probability of the timestamp token
138
+ float ptsum; // sum of probabilities of all timestamp tokens
139
+
140
+ // token-level timestamp data
141
+ // do not use if you haven't computed token-level timestamps
142
+ int64_t t0; // start time of the token
143
+ int64_t t1; // end time of the token
144
+
145
+ // [EXPERIMENTAL] Token-level timestamps with DTW
146
+ // do not use if you haven't computed token-level timestamps with dtw
147
+ // Roughly corresponds to the moment in audio in which the token was output
148
+ int64_t t_dtw;
149
+
150
+ float vlen; // voice length of the token
151
+ } whisper_token_data;
152
+
153
+ typedef struct whisper_model_loader {
154
+ void * context;
155
+
156
+ size_t (*read)(void * ctx, void * output, size_t read_size);
157
+ bool (*eof)(void * ctx);
158
+ void (*close)(void * ctx);
159
+ } whisper_model_loader;
160
+
161
+ // grammar element type
162
+ enum whisper_gretype {
163
+ // end of rule definition
164
+ WHISPER_GRETYPE_END = 0,
165
+
166
+ // start of alternate definition for rule
167
+ WHISPER_GRETYPE_ALT = 1,
168
+
169
+ // non-terminal element: reference to rule
170
+ WHISPER_GRETYPE_RULE_REF = 2,
171
+
172
+ // terminal element: character (code point)
173
+ WHISPER_GRETYPE_CHAR = 3,
174
+
175
+ // inverse char(s) ([^a], [^a-b], [^abc])
176
+ WHISPER_GRETYPE_CHAR_NOT = 4,
177
+
178
+ // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
179
+ // be an inclusive range ([a-z])
180
+ WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
181
+
182
+ // modifies a preceding WHISPER_GRETYPE_CHAR or
183
+ // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
184
+ WHISPER_GRETYPE_CHAR_ALT = 6,
185
+ };
186
+
187
+ typedef struct whisper_grammar_element {
188
+ enum whisper_gretype type;
189
+ uint32_t value; // Unicode code point or rule ID
190
+ } whisper_grammar_element;
191
+
192
+ // Various functions for loading a ggml whisper model.
193
+ // Allocate (almost) all memory needed for the model.
194
+ // Return NULL on failure
195
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params);
196
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
197
+ WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params);
198
+
199
+ // These are the same as the above, but the internal state of the context is not allocated automatically
200
+ // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
201
+ WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params);
202
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
203
+ WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params);
204
+
205
+ WHISPER_DEPRECATED(
206
+ WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
207
+ "use whisper_init_from_file_with_params instead"
208
+ );
209
+ WHISPER_DEPRECATED(
210
+ WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
211
+ "use whisper_init_from_buffer_with_params instead"
212
+ );
213
+ WHISPER_DEPRECATED(
214
+ WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
215
+ "use whisper_init_with_params instead"
216
+ );
217
+ WHISPER_DEPRECATED(
218
+ WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
219
+ "use whisper_init_from_file_with_params_no_state instead"
220
+ );
221
+ WHISPER_DEPRECATED(
222
+ WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
223
+ "use whisper_init_from_buffer_with_params_no_state instead"
224
+ );
225
+ WHISPER_DEPRECATED(
226
+ WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
227
+ "use whisper_init_with_params_no_state instead"
228
+ );
229
+
230
+ WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
231
+
232
+ // Given a context, enable use of OpenVINO for encode inference.
233
+ // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
234
+ // the path will be generated from the ggml model path that was passed
235
+ // in to whisper_init_from_file. For example, if 'path_model' was
236
+ // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
237
+ // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
238
+ // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
239
+ // cache_dir: Optional cache directory that can speed up init time, especially for
240
+ // GPU, by caching compiled 'blobs' there.
241
+ // Set to nullptr if not used.
242
+ // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
243
+ WHISPER_API int whisper_ctx_init_openvino_encoder_with_state(
244
+ struct whisper_context * ctx,
245
+ struct whisper_state * state,
246
+ const char * model_path,
247
+ const char * device,
248
+ const char * cache_dir);
249
+
250
+ WHISPER_API int whisper_ctx_init_openvino_encoder(
251
+ struct whisper_context * ctx,
252
+ const char * model_path,
253
+ const char * device,
254
+ const char * cache_dir);
255
+
256
+ // Frees all allocated memory
257
+ WHISPER_API void whisper_free (struct whisper_context * ctx);
258
+ WHISPER_API void whisper_free_state(struct whisper_state * state);
259
+ WHISPER_API void whisper_free_params(struct whisper_full_params * params);
260
+ WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
261
+
262
+ // Convert RAW PCM audio to log mel spectrogram.
263
+ // The resulting spectrogram is stored inside the default state of the provided whisper context.
264
+ // Returns 0 on success
265
+ WHISPER_API int whisper_pcm_to_mel(
266
+ struct whisper_context * ctx,
267
+ const float * samples,
268
+ int n_samples,
269
+ int n_threads);
270
+
271
+ WHISPER_API int whisper_pcm_to_mel_with_state(
272
+ struct whisper_context * ctx,
273
+ struct whisper_state * state,
274
+ const float * samples,
275
+ int n_samples,
276
+ int n_threads);
277
+
278
+ // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
279
+ // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
280
+ // n_mel must be 80
281
+ // Returns 0 on success
282
+ WHISPER_API int whisper_set_mel(
283
+ struct whisper_context * ctx,
284
+ const float * data,
285
+ int n_len,
286
+ int n_mel);
287
+
288
+ WHISPER_API int whisper_set_mel_with_state(
289
+ struct whisper_context * ctx,
290
+ struct whisper_state * state,
291
+ const float * data,
292
+ int n_len,
293
+ int n_mel);
294
+
295
+ // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
296
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
297
+ // offset can be used to specify the offset of the first frame in the spectrogram.
298
+ // Returns 0 on success
299
+ WHISPER_API int whisper_encode(
300
+ struct whisper_context * ctx,
301
+ int offset,
302
+ int n_threads);
303
+
304
+ WHISPER_API int whisper_encode_with_state(
305
+ struct whisper_context * ctx,
306
+ struct whisper_state * state,
307
+ int offset,
308
+ int n_threads);
309
+
310
+ // Run the Whisper decoder to obtain the logits and probabilities for the next token.
311
+ // Make sure to call whisper_encode() first.
312
+ // tokens + n_tokens is the provided context for the decoder.
313
+ // n_past is the number of tokens to use from previous decoder calls.
314
+ // Returns 0 on success
315
+ // TODO: add support for multiple decoders
316
+ WHISPER_API int whisper_decode(
317
+ struct whisper_context * ctx,
318
+ const whisper_token * tokens,
319
+ int n_tokens,
320
+ int n_past,
321
+ int n_threads);
322
+
323
+ WHISPER_API int whisper_decode_with_state(
324
+ struct whisper_context * ctx,
325
+ struct whisper_state * state,
326
+ const whisper_token * tokens,
327
+ int n_tokens,
328
+ int n_past,
329
+ int n_threads);
330
+
331
+ // Convert the provided text into tokens.
332
+ // The tokens pointer must be large enough to hold the resulting tokens.
333
+ // Returns the number of tokens on success, no more than n_max_tokens
334
+ // Returns a negative number on failure - the number of tokens that would have been returned
335
+ // TODO: not sure if correct
336
+ WHISPER_API int whisper_tokenize(
337
+ struct whisper_context * ctx,
338
+ const char * text,
339
+ whisper_token * tokens,
340
+ int n_max_tokens);
341
+
342
+ // Return the number of tokens in the provided text
343
+ // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
344
+ WHISPER_API int whisper_token_count(struct whisper_context * ctx, const char * text);
345
+
346
+ // Largest language id (i.e. number of available languages - 1)
347
+ WHISPER_API int whisper_lang_max_id(void);
348
+
349
+ // Return the id of the specified language, returns -1 if not found
350
+ // Examples:
351
+ // "de" -> 2
352
+ // "german" -> 2
353
+ WHISPER_API int whisper_lang_id(const char * lang);
354
+
355
+ // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
356
+ WHISPER_API const char * whisper_lang_str(int id);
357
+
358
+ // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
359
+ WHISPER_API const char * whisper_lang_str_full(int id);
360
+
361
+ // Use mel data at offset_ms to try and auto-detect the spoken language
362
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
363
+ // Returns the top language id or negative on failure
364
+ // If not null, fills the lang_probs array with the probabilities of all languages
365
+ // The array must be whisper_lang_max_id() + 1 in size
366
+ // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
367
+ WHISPER_API int whisper_lang_auto_detect(
368
+ struct whisper_context * ctx,
369
+ int offset_ms,
370
+ int n_threads,
371
+ float * lang_probs);
372
+
373
+ WHISPER_API int whisper_lang_auto_detect_with_state(
374
+ struct whisper_context * ctx,
375
+ struct whisper_state * state,
376
+ int offset_ms,
377
+ int n_threads,
378
+ float * lang_probs);
379
+
380
+ WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
381
+ WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
382
+ WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
383
+ WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
384
+ WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
385
+ WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
386
+
387
+ WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
388
+ WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
389
+ WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
390
+ WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
391
+ WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
392
+ WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
393
+ WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
394
+ WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
395
+ WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
396
+ WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
397
+ WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
398
+ WHISPER_API int whisper_model_type (struct whisper_context * ctx);
399
+
400
+ // Token logits obtained from the last call to whisper_decode()
401
+ // The logits for the last token are stored in the last row
402
+ // Rows: n_tokens
403
+ // Cols: n_vocab
404
+ WHISPER_API float * whisper_get_logits (struct whisper_context * ctx);
405
+ WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
406
+
407
+ // Token Id -> String. Uses the vocabulary in the provided context
408
+ WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
409
+ WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
410
+
411
+
412
+ // Special tokens
413
+ WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
414
+ WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
415
+ WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
416
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
417
+ WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
418
+ WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
419
+ WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
420
+ WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
421
+
422
+ // Task tokens
423
+ WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
424
+ WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
425
+
426
+ // Performance information from the default state.
427
+ struct whisper_timings {
428
+ float sample_ms;
429
+ float encode_ms;
430
+ float decode_ms;
431
+ float batchd_ms;
432
+ float prompt_ms;
433
+ };
434
+ WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx);
435
+ WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
436
+ WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
437
+
438
+ // Print system information
439
+ WHISPER_API const char * whisper_print_system_info(void);
440
+
441
+ ////////////////////////////////////////////////////////////////////////////
442
+
443
+ // Available sampling strategies
444
+ enum whisper_sampling_strategy {
445
+ WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
446
+ WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
447
+ };
448
+
449
+ // Text segment callback
450
+ // Called on every newly generated text segment
451
+ // Use the whisper_full_...() functions to obtain the text segments
452
+ typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
453
+
454
+ // Progress callback
455
+ typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
456
+
457
+ // Encoder begin callback
458
+ // If not NULL, called before the encoder starts
459
+ // If it returns false, the computation is aborted
460
+ typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
461
+
462
+ // Logits filter callback
463
+ // Can be used to modify the logits before sampling
464
+ // If not NULL, called after applying temperature to logits
465
+ typedef void (*whisper_logits_filter_callback)(
466
+ struct whisper_context * ctx,
467
+ struct whisper_state * state,
468
+ const whisper_token_data * tokens,
469
+ int n_tokens,
470
+ float * logits,
471
+ void * user_data);
472
+
473
+ // Parameters for the whisper_full() function
474
+ // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
475
+ // whisper_full_default_params()
476
+ struct whisper_full_params {
477
+ enum whisper_sampling_strategy strategy;
478
+
479
+ int n_threads;
480
+ int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
481
+ int offset_ms; // start offset in ms
482
+ int duration_ms; // audio duration to process in ms
483
+
484
+ bool translate;
485
+ bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
486
+ bool no_timestamps; // do not generate timestamps
487
+ bool single_segment; // force single segment output (useful for streaming)
488
+ bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
489
+ bool print_progress; // print progress information
490
+ bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
491
+ bool print_timestamps; // print timestamps for each text segment when printing realtime
492
+
493
+ // [EXPERIMENTAL] token-level timestamps
494
+ bool token_timestamps; // enable token-level timestamps
495
+ float thold_pt; // timestamp token probability threshold (~0.01)
496
+ float thold_ptsum; // timestamp token sum probability threshold (~0.01)
497
+ int max_len; // max segment length in characters
498
+ bool split_on_word; // split on word rather than on token (when used with max_len)
499
+ int max_tokens; // max tokens per segment (0 = no limit)
500
+
501
+ // [EXPERIMENTAL] speed-up techniques
502
+ // note: these can significantly reduce the quality of the output
503
+ bool debug_mode; // enable debug mode, which provides extra info (e.g. dumps log_mel)
504
+ int audio_ctx; // overwrite the audio context size (0 = use default)
505
+
506
+ // [EXPERIMENTAL] [TDRZ] tinydiarize
507
+ bool tdrz_enable; // enable tinydiarize speaker turn detection
508
+
509
+ // A regular expression that matches tokens to suppress
510
+ const char * suppress_regex;
511
+
512
+ // tokens to provide to the whisper decoder as initial prompt
513
+ // these are prepended to any existing text context from a previous call
514
+ // use whisper_tokenize() to convert text to tokens
515
+ // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
516
+ const char * initial_prompt;
517
+ const whisper_token * prompt_tokens;
518
+ int prompt_n_tokens;
519
+
520
+ // for auto-detection, set to nullptr, "" or "auto"
521
+ const char * language;
522
+ bool detect_language;
523
+
524
+ // common decoding parameters:
525
+ bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
526
+ bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
527
+
528
+ float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
529
+ float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
530
+ float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
531
+
532
+ // fallback parameters
533
+ // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
534
+ float temperature_inc;
535
+ float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
536
+ float logprob_thold;
537
+ float no_speech_thold;
538
+
539
+ struct {
540
+ int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
541
+ } greedy;
542
+
543
+ struct {
544
+ int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
545
+
546
+ float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
547
+ } beam_search;
548
+
549
+ // called for every newly generated text segment
550
+ whisper_new_segment_callback new_segment_callback;
551
+ void * new_segment_callback_user_data;
552
+
553
+ // called on each progress update
554
+ whisper_progress_callback progress_callback;
555
+ void * progress_callback_user_data;
556
+
557
+ // called each time before the encoder starts
558
+ whisper_encoder_begin_callback encoder_begin_callback;
559
+ void * encoder_begin_callback_user_data;
560
+
561
+ // called each time before ggml computation starts
562
+ ggml_abort_callback abort_callback;
563
+ void * abort_callback_user_data;
564
+
565
+ // called by each decoder to filter obtained logits
566
+ whisper_logits_filter_callback logits_filter_callback;
567
+ void * logits_filter_callback_user_data;
568
+
569
+ const whisper_grammar_element ** grammar_rules;
570
+ size_t n_grammar_rules;
571
+ size_t i_start_rule;
572
+ float grammar_penalty;
573
+ };
574
+
575
+ // NOTE: the *_by_ref() variants allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
576
+ WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
577
+ WHISPER_API struct whisper_context_params whisper_context_default_params (void);
578
+ WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
579
+ WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy);
580
+
581
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
582
+ // Not thread safe if called concurrently on the same context
583
+ // Uses the specified decoding strategy to obtain the text.
584
+ WHISPER_API int whisper_full(
585
+ struct whisper_context * ctx,
586
+ struct whisper_full_params params,
587
+ const float * samples,
588
+ int n_samples);
589
+
590
+ WHISPER_API int whisper_full_with_state(
591
+ struct whisper_context * ctx,
592
+ struct whisper_state * state,
593
+ struct whisper_full_params params,
594
+ const float * samples,
595
+ int n_samples);
596
+
597
+ // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
598
+ // Result is stored in the default state of the context
599
+ // Not thread safe if executed in parallel on the same context.
600
+ // It seems this approach can offer some speedup in some cases.
601
+ // However, the transcription accuracy can be worse at the beginning and end of each chunk.
602
+ WHISPER_API int whisper_full_parallel(
603
+ struct whisper_context * ctx,
604
+ struct whisper_full_params params,
605
+ const float * samples,
606
+ int n_samples,
607
+ int n_processors);
608
+
609
+ // Number of generated text segments
610
+ // A segment can be a few words, a sentence, or even a paragraph.
611
+ WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx);
612
+ WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
613
+
614
+ // Language id associated with the context's default state
615
+ WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
616
+
617
+ // Language id associated with the provided state
618
+ WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
619
+
620
+ // Get the start and end time of the specified segment
621
+ WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment);
622
+ WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
623
+
624
+ WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
625
+ WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
626
+
627
+ // Get whether the next segment is predicted as a speaker turn
628
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
629
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
630
+
631
+ // Get the text of the specified segment
632
+ WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
633
+ WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
634
+
635
+ // Get number of tokens in the specified segment
636
+ WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
637
+ WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
638
+
639
+ // Get the token text of the specified token in the specified segment
640
+ WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
641
+ WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
642
+
643
+ WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
644
+ WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
645
+
646
+ // Get token data for the specified token in the specified segment
647
+ // This contains probabilities, timestamps, etc.
648
+ WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
649
+ WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
650
+
651
+ // Get the probability of the specified token in the specified segment
652
+ WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
653
+ WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
654
+
655
+ ////////////////////////////////////////////////////////////////////////////
656
+
657
+ // Temporary helpers needed for exposing ggml interface
658
+
659
+ WHISPER_API int whisper_bench_memcpy (int n_threads);
660
+ WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
661
+ WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
662
+ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
663
+
664
+ // Control logging output; default behavior is to print to stderr
665
+
666
+ WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
667
+
668
+ #ifdef __cplusplus
669
+ }
670
+ #endif
671
+
672
+ #endif