whispercpp 1.2.0.2 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/whisper.h DELETED
@@ -1,402 +0,0 @@
1
- #ifndef WHISPER_H
2
- #define WHISPER_H
3
-
4
- #include <stddef.h>
5
- #include <stdint.h>
6
- #include <stdbool.h>
7
-
8
- #ifdef WHISPER_SHARED
9
- # ifdef _WIN32
10
- # ifdef WHISPER_BUILD
11
- # define WHISPER_API __declspec(dllexport)
12
- # else
13
- # define WHISPER_API __declspec(dllimport)
14
- # endif
15
- # else
16
- # define WHISPER_API __attribute__ ((visibility ("default")))
17
- # endif
18
- #else
19
- # define WHISPER_API
20
- #endif
21
-
22
- #define WHISPER_SAMPLE_RATE 16000
23
- #define WHISPER_N_FFT 400
24
- #define WHISPER_N_MEL 80
25
- #define WHISPER_HOP_LENGTH 160
26
- #define WHISPER_CHUNK_SIZE 30
27
-
28
- #ifdef __cplusplus
29
- extern "C" {
30
- #endif
31
-
32
- //
33
- // C interface
34
- //
35
- // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
36
- // concurrently.
37
- //
38
- // Basic usage:
39
- //
40
- // #include "whisper.h"
41
- //
42
- // ...
43
- //
44
- // struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
45
- //
46
- // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
47
- // fprintf(stderr, "failed to process audio\n");
48
- // return 7;
49
- // }
50
- //
51
- // const int n_segments = whisper_full_n_segments(ctx);
52
- // for (int i = 0; i < n_segments; ++i) {
53
- // const char * text = whisper_full_get_segment_text(ctx, i);
54
- // printf("%s", text);
55
- // }
56
- //
57
- // whisper_free(ctx);
58
- //
59
- // ...
60
- //
61
- // This is a demonstration of the most straightforward usage of the library.
62
- // "pcmf32" contains the RAW audio data in 32-bit floating point format.
63
- //
64
- // The interface also allows for more fine-grained control over the computation, but it requires a deeper
65
- // understanding of how the model works.
66
- //
67
-
68
- struct whisper_context;
69
-
70
- typedef int whisper_token;
71
-
72
- typedef struct whisper_token_data {
73
- whisper_token id; // token id
74
- whisper_token tid; // forced timestamp token id
75
-
76
- float p; // probability of the token
77
- float plog; // log probability of the token
78
- float pt; // probability of the timestamp token
79
- float ptsum; // sum of probabilities of all timestamp tokens
80
-
81
- // token-level timestamp data
82
- // do not use if you haven't computed token-level timestamps
83
- int64_t t0; // start time of the token
84
- int64_t t1; // end time of the token
85
-
86
- float vlen; // voice length of the token
87
- } whisper_token_data;
88
-
89
- typedef struct whisper_model_loader {
90
- void * context;
91
-
92
- size_t (*read)(void * ctx, void * output, size_t read_size);
93
- bool (*eof)(void * ctx);
94
- void (*close)(void * ctx);
95
- } whisper_model_loader;
96
-
97
- // Various functions for loading a ggml whisper model.
98
- // Allocate (almost) all memory needed for the model.
99
- // Return NULL on failure
100
- WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
101
- WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
102
- WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
103
-
104
- // Frees all memory allocated by the model.
105
- WHISPER_API void whisper_free(struct whisper_context * ctx);
106
-
107
- // Convert RAW PCM audio to log mel spectrogram.
108
- // The resulting spectrogram is stored inside the provided whisper context.
109
- // Returns 0 on success
110
- WHISPER_API int whisper_pcm_to_mel(
111
- struct whisper_context * ctx,
112
- const float * samples,
113
- int n_samples,
114
- int n_threads);
115
-
116
- // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
117
- // The resulting spectrogram is stored inside the provided whisper context.
118
- // Returns 0 on success
119
- WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
120
- struct whisper_context* ctx,
121
- const float* samples,
122
- int n_samples,
123
- int n_threads);
124
-
125
-
126
- // This can be used to set a custom log mel spectrogram inside the provided whisper context.
127
- // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
128
- // n_mel must be 80
129
- // Returns 0 on success
130
- WHISPER_API int whisper_set_mel(
131
- struct whisper_context * ctx,
132
- const float * data,
133
- int n_len,
134
- int n_mel);
135
-
136
- // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
137
- // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
138
- // offset can be used to specify the offset of the first frame in the spectrogram.
139
- // Returns 0 on success
140
- WHISPER_API int whisper_encode(
141
- struct whisper_context * ctx,
142
- int offset,
143
- int n_threads);
144
-
145
- // Run the Whisper decoder to obtain the logits and probabilities for the next token.
146
- // Make sure to call whisper_encode() first.
147
- // tokens + n_tokens is the provided context for the decoder.
148
- // n_past is the number of tokens to use from previous decoder calls.
149
- // Returns 0 on success
150
- // TODO: add support for multiple decoders
151
- WHISPER_API int whisper_decode(
152
- struct whisper_context * ctx,
153
- const whisper_token * tokens,
154
- int n_tokens,
155
- int n_past,
156
- int n_threads);
157
-
158
- // Convert the provided text into tokens.
159
- // The tokens pointer must be large enough to hold the resulting tokens.
160
- // Returns the number of tokens on success, no more than n_max_tokens
161
- // Returns -1 on failure
162
- // TODO: not sure if correct
163
- WHISPER_API int whisper_tokenize(
164
- struct whisper_context * ctx,
165
- const char * text,
166
- whisper_token * tokens,
167
- int n_max_tokens);
168
-
169
- // Largest language id (i.e. number of available languages - 1)
170
- WHISPER_API int whisper_lang_max_id();
171
-
172
- // Return the id of the specified language, returns -1 if not found
173
- // Examples:
174
- // "de" -> 2
175
- // "german" -> 2
176
- WHISPER_API int whisper_lang_id(const char * lang);
177
-
178
- // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
179
- WHISPER_API const char * whisper_lang_str(int id);
180
-
181
- // Use mel data at offset_ms to try and auto-detect the spoken language
182
- // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
183
- // Returns the top language id or negative on failure
184
- // If not null, fills the lang_probs array with the probabilities of all languages
185
- // The array must be whisper_lang_max_id() + 1 in size
186
- // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
187
- WHISPER_API int whisper_lang_auto_detect(
188
- struct whisper_context * ctx,
189
- int offset_ms,
190
- int n_threads,
191
- float * lang_probs);
192
-
193
- WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
194
- WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
195
- WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
196
- WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
197
- WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
198
-
199
- // Token logits obtained from the last call to whisper_decode()
200
- // The logits for the last token are stored in the last row
201
- // Rows: n_tokens
202
- // Cols: n_vocab
203
- WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
204
-
205
- // Token Id -> String. Uses the vocabulary in the provided context
206
- WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
207
-
208
- // Special tokens
209
- WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
210
- WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
211
- WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
212
- WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
213
- WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
214
- WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
215
- WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
216
-
217
- // Task tokens
218
- WHISPER_API whisper_token whisper_token_translate (void);
219
- WHISPER_API whisper_token whisper_token_transcribe(void);
220
-
221
- // Performance information
222
- WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
223
- WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
224
-
225
- // Print system information
226
- WHISPER_API const char * whisper_print_system_info(void);
227
-
228
- // Abort a running whisper_full_parallel or whisper_full
229
- WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
230
-
231
- // Resume whisper context from an aborted state allowing it run again
232
- WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
233
-
234
- // Check the whisper context state: if true, it can run; if false, it cannot
235
- WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
236
-
237
- ////////////////////////////////////////////////////////////////////////////
238
-
239
- // Available sampling strategies
240
- enum whisper_sampling_strategy {
241
- WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder
242
- WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
243
- };
244
-
245
- // Text segment callback
246
- // Called on every newly generated text segment
247
- // Use the whisper_full_...() functions to obtain the text segments
248
- typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
249
-
250
- // Encoder begin callback
251
- // If not NULL, called before the encoder starts
252
- // If it returns false, the computation is aborted
253
- typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
254
-
255
- // Logits filter callback
256
- // Can be used to modify the logits before sampling
257
- // If not NULL, called after applying temperature to logits
258
- typedef void (*whisper_logits_filter_callback)(
259
- struct whisper_context * ctx,
260
- const whisper_token_data * tokens,
261
- int n_tokens,
262
- float * logits,
263
- void * user_data);
264
-
265
- // Parameters for the whisper_full() function
266
- // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
267
- // whisper_full_default_params()
268
- struct whisper_full_params {
269
- enum whisper_sampling_strategy strategy;
270
-
271
- int n_threads;
272
- int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
273
- int offset_ms; // start offset in ms
274
- int duration_ms; // audio duration to process in ms
275
-
276
- bool translate;
277
- bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
278
- bool single_segment; // force single segment output (useful for streaming)
279
- bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
280
- bool print_progress; // print progress information
281
- bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
282
- bool print_timestamps; // print timestamps for each text segment when printing realtime
283
-
284
- // [EXPERIMENTAL] token-level timestamps
285
- bool token_timestamps; // enable token-level timestamps
286
- float thold_pt; // timestamp token probability threshold (~0.01)
287
- float thold_ptsum; // timestamp token sum probability threshold (~0.01)
288
- int max_len; // max segment length in characters
289
- bool split_on_word; // split on word rather than on token (when used with max_len)
290
- int max_tokens; // max tokens per segment (0 = no limit)
291
-
292
- // [EXPERIMENTAL] speed-up techniques
293
- // note: these can significantly reduce the quality of the output
294
- bool speed_up; // speed-up the audio by 2x using Phase Vocoder
295
- int audio_ctx; // overwrite the audio context size (0 = use default)
296
-
297
- // tokens to provide to the whisper decoder as initial prompt
298
- // these are prepended to any existing text context from a previous call
299
- const whisper_token * prompt_tokens;
300
- int prompt_n_tokens;
301
-
302
- // for auto-detection, set to nullptr, "" or "auto"
303
- const char * language;
304
-
305
- // common decoding parameters:
306
- bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
307
- bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
308
-
309
- float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
310
- float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
311
- float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
312
-
313
- // fallback parameters
314
- // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
315
- float temperature_inc;
316
- float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
317
- float logprob_thold;
318
- float no_speech_thold; // TODO: not implemented
319
-
320
- struct {
321
- int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
322
- } greedy;
323
-
324
- struct {
325
- int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
326
-
327
- float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
328
- } beam_search;
329
-
330
- // called for every newly generated text segment
331
- whisper_new_segment_callback new_segment_callback;
332
- void * new_segment_callback_user_data;
333
-
334
- // called each time before the encoder starts
335
- whisper_encoder_begin_callback encoder_begin_callback;
336
- void * encoder_begin_callback_user_data;
337
-
338
- // called by each decoder to filter obtained logits
339
- whisper_logits_filter_callback logits_filter_callback;
340
- void * logits_filter_callback_user_data;
341
- };
342
-
343
- WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
344
-
345
- // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
346
- // Uses the specified decoding strategy to obtain the text.
347
- WHISPER_API int whisper_full(
348
- struct whisper_context * ctx,
349
- struct whisper_full_params params,
350
- const float * samples,
351
- int n_samples);
352
-
353
- // Split the input audio in chunks and process each chunk separately using whisper_full()
354
- // It seems this approach can offer some speedup in some cases.
355
- // However, the transcription accuracy can be worse at the beginning and end of each chunk.
356
- WHISPER_API int whisper_full_parallel(
357
- struct whisper_context * ctx,
358
- struct whisper_full_params params,
359
- const float * samples,
360
- int n_samples,
361
- int n_processors);
362
-
363
- // Number of generated text segments.
364
- // A segment can be a few words, a sentence, or even a paragraph.
365
- WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
366
-
367
- // Language id associated with the current context
368
- WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
369
-
370
- // Get the start and end time of the specified segment.
371
- WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
372
- WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
373
-
374
- // Get the text of the specified segment.
375
- WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
376
-
377
- // Get number of tokens in the specified segment.
378
- WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
379
-
380
- // Get the token text of the specified token in the specified segment.
381
- WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
382
- WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
383
-
384
- // Get token data for the specified token in the specified segment.
385
- // This contains probabilities, timestamps, etc.
386
- WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
387
-
388
- // Get the probability of the specified token in the specified segment.
389
- WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
390
-
391
- ////////////////////////////////////////////////////////////////////////////
392
-
393
- // Temporary helpers needed for exposing ggml interface
394
-
395
- WHISPER_API int whisper_bench_memcpy(int n_threads);
396
- WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
397
-
398
- #ifdef __cplusplus
399
- }
400
- #endif
401
-
402
- #endif