whispercpp 1.2.0.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/Rakefile +4 -19
 - data/ext/extconf.rb +9 -0
 - data/ext/ggml.c +18380 -5241
 - data/ext/ggml.h +2156 -502
 - data/ext/ruby_whisper.cpp +2 -2
 - data/ext/whisper.cpp +4184 -1774
 - data/ext/whisper.h +348 -56
 - metadata +3 -3
 
    
        data/ext/whisper.h
    CHANGED
    
    | 
         @@ -1,10 +1,20 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            #ifndef WHISPER_H
         
     | 
| 
       2 
2 
     | 
    
         
             
            #define WHISPER_H
         
     | 
| 
       3 
3 
     | 
    
         | 
| 
      
 4 
     | 
    
         
            +
            #include "ggml.h"
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
       4 
6 
     | 
    
         
             
            #include <stddef.h>
         
     | 
| 
       5 
7 
     | 
    
         
             
            #include <stdint.h>
         
     | 
| 
       6 
8 
     | 
    
         
             
            #include <stdbool.h>
         
     | 
| 
       7 
9 
     | 
    
         | 
| 
      
 10 
     | 
    
         
            +
            #ifdef __GNUC__
         
     | 
| 
      
 11 
     | 
    
         
            +
            #    define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
         
     | 
| 
      
 12 
     | 
    
         
            +
            #elif defined(_MSC_VER)
         
     | 
| 
      
 13 
     | 
    
         
            +
            #    define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
         
     | 
| 
      
 14 
     | 
    
         
            +
            #else
         
     | 
| 
      
 15 
     | 
    
         
            +
            #    define WHISPER_DEPRECATED(func, hint) func
         
     | 
| 
      
 16 
     | 
    
         
            +
            #endif
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
       8 
18 
     | 
    
         
             
            #ifdef WHISPER_SHARED
         
     | 
| 
       9 
19 
     | 
    
         
             
            #    ifdef _WIN32
         
     | 
| 
       10 
20 
     | 
    
         
             
            #        ifdef WHISPER_BUILD
         
     | 
| 
         @@ -21,7 +31,6 @@ 
     | 
|
| 
       21 
31 
     | 
    
         | 
| 
       22 
32 
     | 
    
         
             
            #define WHISPER_SAMPLE_RATE 16000
         
     | 
| 
       23 
33 
     | 
    
         
             
            #define WHISPER_N_FFT       400
         
     | 
| 
       24 
     | 
    
         
            -
            #define WHISPER_N_MEL       80
         
     | 
| 
       25 
34 
     | 
    
         
             
            #define WHISPER_HOP_LENGTH  160
         
     | 
| 
       26 
35 
     | 
    
         
             
            #define WHISPER_CHUNK_SIZE  30
         
     | 
| 
       27 
36 
     | 
    
         | 
| 
         @@ -41,7 +50,9 @@ extern "C" { 
     | 
|
| 
       41 
50 
     | 
    
         
             
                //
         
     | 
| 
       42 
51 
     | 
    
         
             
                //     ...
         
     | 
| 
       43 
52 
     | 
    
         
             
                //
         
     | 
| 
       44 
     | 
    
         
            -
                //      
     | 
| 
      
 53 
     | 
    
         
            +
                //     whisper_context_params cparams = whisper_context_default_params();
         
     | 
| 
      
 54 
     | 
    
         
            +
                //
         
     | 
| 
      
 55 
     | 
    
         
            +
                //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
         
     | 
| 
       45 
56 
     | 
    
         
             
                //
         
     | 
| 
       46 
57 
     | 
    
         
             
                //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
         
     | 
| 
       47 
58 
     | 
    
         
             
                //         fprintf(stderr, "failed to process audio\n");
         
     | 
| 
         @@ -66,8 +77,53 @@ extern "C" { 
     | 
|
| 
       66 
77 
     | 
    
         
             
                //
         
     | 
| 
       67 
78 
     | 
    
         | 
| 
       68 
79 
     | 
    
         
             
                struct whisper_context;
         
     | 
| 
      
 80 
     | 
    
         
            +
                struct whisper_state;
         
     | 
| 
      
 81 
     | 
    
         
            +
                struct whisper_full_params;
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
      
 83 
     | 
    
         
            +
                typedef int32_t whisper_pos;
         
     | 
| 
      
 84 
     | 
    
         
            +
                typedef int32_t whisper_token;
         
     | 
| 
      
 85 
     | 
    
         
            +
                typedef int32_t whisper_seq_id;
         
     | 
| 
      
 86 
     | 
    
         
            +
             
     | 
| 
      
 87 
     | 
    
         
            +
                enum whisper_alignment_heads_preset {
         
     | 
| 
      
 88 
     | 
    
         
            +
                    WHISPER_AHEADS_NONE,
         
     | 
| 
      
 89 
     | 
    
         
            +
                    WHISPER_AHEADS_N_TOP_MOST,  // All heads from the N-top-most text-layers
         
     | 
| 
      
 90 
     | 
    
         
            +
                    WHISPER_AHEADS_CUSTOM,
         
     | 
| 
      
 91 
     | 
    
         
            +
                    WHISPER_AHEADS_TINY_EN,
         
     | 
| 
      
 92 
     | 
    
         
            +
                    WHISPER_AHEADS_TINY,
         
     | 
| 
      
 93 
     | 
    
         
            +
                    WHISPER_AHEADS_BASE_EN,
         
     | 
| 
      
 94 
     | 
    
         
            +
                    WHISPER_AHEADS_BASE,
         
     | 
| 
      
 95 
     | 
    
         
            +
                    WHISPER_AHEADS_SMALL_EN,
         
     | 
| 
      
 96 
     | 
    
         
            +
                    WHISPER_AHEADS_SMALL,
         
     | 
| 
      
 97 
     | 
    
         
            +
                    WHISPER_AHEADS_MEDIUM_EN,
         
     | 
| 
      
 98 
     | 
    
         
            +
                    WHISPER_AHEADS_MEDIUM,
         
     | 
| 
      
 99 
     | 
    
         
            +
                    WHISPER_AHEADS_LARGE_V1,
         
     | 
| 
      
 100 
     | 
    
         
            +
                    WHISPER_AHEADS_LARGE_V2,
         
     | 
| 
      
 101 
     | 
    
         
            +
                    WHISPER_AHEADS_LARGE_V3,
         
     | 
| 
      
 102 
     | 
    
         
            +
                };
         
     | 
| 
      
 103 
     | 
    
         
            +
             
     | 
| 
      
 104 
     | 
    
         
            +
                typedef struct whisper_ahead {
         
     | 
| 
      
 105 
     | 
    
         
            +
                    int n_text_layer;
         
     | 
| 
      
 106 
     | 
    
         
            +
                    int n_head;
         
     | 
| 
      
 107 
     | 
    
         
            +
                } whisper_ahead;
         
     | 
| 
      
 108 
     | 
    
         
            +
             
     | 
| 
      
 109 
     | 
    
         
            +
                typedef struct whisper_aheads {
         
     | 
| 
      
 110 
     | 
    
         
            +
                    size_t n_heads;
         
     | 
| 
      
 111 
     | 
    
         
            +
                    const whisper_ahead * heads;
         
     | 
| 
      
 112 
     | 
    
         
            +
                } whisper_aheads;
         
     | 
| 
      
 113 
     | 
    
         
            +
             
     | 
| 
      
 114 
     | 
    
         
            +
                struct whisper_context_params {
         
     | 
| 
      
 115 
     | 
    
         
            +
                    bool  use_gpu;
         
     | 
| 
      
 116 
     | 
    
         
            +
                    int   gpu_device;  // CUDA device
         
     | 
| 
      
 117 
     | 
    
         
            +
             
     | 
| 
      
 118 
     | 
    
         
            +
                    // [EXPERIMENTAL] Token-level timestamps with DTW
         
     | 
| 
      
 119 
     | 
    
         
            +
                    bool dtw_token_timestamps;
         
     | 
| 
      
 120 
     | 
    
         
            +
                    enum whisper_alignment_heads_preset dtw_aheads_preset;
         
     | 
| 
       69 
121 
     | 
    
         | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
      
 122 
     | 
    
         
            +
                    int dtw_n_top;
         
     | 
| 
      
 123 
     | 
    
         
            +
                    struct whisper_aheads dtw_aheads;
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
                    size_t dtw_mem_size; // TODO: remove
         
     | 
| 
      
 126 
     | 
    
         
            +
                };
         
     | 
| 
       71 
127 
     | 
    
         | 
| 
       72 
128 
     | 
    
         
             
                typedef struct whisper_token_data {
         
     | 
| 
       73 
129 
     | 
    
         
             
                    whisper_token id;  // token id
         
     | 
| 
         @@ -83,6 +139,11 @@ extern "C" { 
     | 
|
| 
       83 
139 
     | 
    
         
             
                    int64_t t0;        // start time of the token
         
     | 
| 
       84 
140 
     | 
    
         
             
                    int64_t t1;        //   end time of the token
         
     | 
| 
       85 
141 
     | 
    
         | 
| 
      
 142 
     | 
    
         
            +
                    // [EXPERIMENTAL] Token-level timestamps with DTW
         
     | 
| 
      
 143 
     | 
    
         
            +
                    // Do not use if you haven't computed token-level timestamps with DTW
         
     | 
| 
      
 144 
     | 
    
         
            +
                    // Roughly corresponds to the moment in audio in which the token was output
         
     | 
| 
      
 145 
     | 
    
         
            +
                    int64_t t_dtw;
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
       86 
147 
     | 
    
         
             
                    float vlen;        // voice length of the token
         
     | 
| 
       87 
148 
     | 
    
         
             
                } whisper_token_data;
         
     | 
| 
       88 
149 
     | 
    
         | 
| 
         @@ -94,18 +155,102 @@ extern "C" { 
     | 
|
| 
       94 
155 
     | 
    
         
             
                    void  (*close)(void * ctx);
         
     | 
| 
       95 
156 
     | 
    
         
             
                } whisper_model_loader;
         
     | 
| 
       96 
157 
     | 
    
         | 
| 
      
 158 
     | 
    
         
            +
                // grammar element type
         
     | 
| 
      
 159 
     | 
    
         
            +
                enum whisper_gretype {
         
     | 
| 
      
 160 
     | 
    
         
            +
                    // end of rule definition
         
     | 
| 
      
 161 
     | 
    
         
            +
                    WHISPER_GRETYPE_END            = 0,
         
     | 
| 
      
 162 
     | 
    
         
            +
             
     | 
| 
      
 163 
     | 
    
         
            +
                    // start of alternate definition for rule
         
     | 
| 
      
 164 
     | 
    
         
            +
                    WHISPER_GRETYPE_ALT            = 1,
         
     | 
| 
      
 165 
     | 
    
         
            +
             
     | 
| 
      
 166 
     | 
    
         
            +
                    // non-terminal element: reference to rule
         
     | 
| 
      
 167 
     | 
    
         
            +
                    WHISPER_GRETYPE_RULE_REF       = 2,
         
     | 
| 
      
 168 
     | 
    
         
            +
             
     | 
| 
      
 169 
     | 
    
         
            +
                    // terminal element: character (code point)
         
     | 
| 
      
 170 
     | 
    
         
            +
                    WHISPER_GRETYPE_CHAR           = 3,
         
     | 
| 
      
 171 
     | 
    
         
            +
             
     | 
| 
      
 172 
     | 
    
         
            +
                    // inverse char(s) ([^a], [^a-b] [^abc])
         
     | 
| 
      
 173 
     | 
    
         
            +
                    WHISPER_GRETYPE_CHAR_NOT       = 4,
         
     | 
| 
      
 174 
     | 
    
         
            +
             
     | 
| 
      
 175 
     | 
    
         
            +
                    // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
         
     | 
| 
      
 176 
     | 
    
         
            +
                    // be an inclusive range ([a-z])
         
     | 
| 
      
 177 
     | 
    
         
            +
                    WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
         
     | 
| 
      
 178 
     | 
    
         
            +
             
     | 
| 
      
 179 
     | 
    
         
            +
                    // modifies a preceding WHISPER_GRETYPE_CHAR or
         
     | 
| 
      
 180 
     | 
    
         
            +
                    // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         
     | 
| 
      
 181 
     | 
    
         
            +
                    WHISPER_GRETYPE_CHAR_ALT       = 6,
         
     | 
| 
      
 182 
     | 
    
         
            +
                };
         
     | 
| 
      
 183 
     | 
    
         
            +
             
     | 
| 
      
 184 
     | 
    
         
            +
                typedef struct whisper_grammar_element {
         
     | 
| 
      
 185 
     | 
    
         
            +
                    enum whisper_gretype type;
         
     | 
| 
      
 186 
     | 
    
         
            +
                    uint32_t             value; // Unicode code point or rule ID
         
     | 
| 
      
 187 
     | 
    
         
            +
                } whisper_grammar_element;
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
       97 
189 
     | 
    
         
             
                // Various functions for loading a ggml whisper model.
         
     | 
| 
       98 
190 
     | 
    
         
             
                // Allocate (almost) all memory needed for the model.
         
     | 
| 
       99 
191 
     | 
    
         
             
                // Return NULL on failure
         
     | 
| 
       100 
     | 
    
         
            -
                WHISPER_API struct whisper_context *  
     | 
| 
       101 
     | 
    
         
            -
                WHISPER_API struct whisper_context *  
     | 
| 
       102 
     | 
    
         
            -
                WHISPER_API struct whisper_context *  
     | 
| 
       103 
     | 
    
         
            -
             
     | 
| 
       104 
     | 
    
         
            -
                //  
     | 
| 
       105 
     | 
    
         
            -
                 
     | 
| 
      
 192 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model,              struct whisper_context_params params);
         
     | 
| 
      
 193 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,    struct whisper_context_params params);
         
     | 
| 
      
 194 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
         
     | 
| 
      
 195 
     | 
    
         
            +
             
     | 
| 
      
 196 
     | 
    
         
            +
                // These are the same as the above, but the internal state of the context is not allocated automatically
         
     | 
| 
      
 197 
     | 
    
         
            +
                // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
         
     | 
| 
      
 198 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model,              struct whisper_context_params params);
         
     | 
| 
      
 199 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,    struct whisper_context_params params);
         
     | 
| 
      
 200 
     | 
    
         
            +
                WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
         
     | 
| 
      
 201 
     | 
    
         
            +
             
     | 
| 
      
 202 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 203 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
         
     | 
| 
      
 204 
     | 
    
         
            +
                    "use whisper_init_from_file_with_params instead"
         
     | 
| 
      
 205 
     | 
    
         
            +
                );
         
     | 
| 
      
 206 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 207 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
         
     | 
| 
      
 208 
     | 
    
         
            +
                    "use whisper_init_from_buffer_with_params instead"
         
     | 
| 
      
 209 
     | 
    
         
            +
                );
         
     | 
| 
      
 210 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 211 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
         
     | 
| 
      
 212 
     | 
    
         
            +
                    "use whisper_init_with_params instead"
         
     | 
| 
      
 213 
     | 
    
         
            +
                );
         
     | 
| 
      
 214 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 215 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
         
     | 
| 
      
 216 
     | 
    
         
            +
                    "use whisper_init_from_file_with_params_no_state instead"
         
     | 
| 
      
 217 
     | 
    
         
            +
                );
         
     | 
| 
      
 218 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 219 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
         
     | 
| 
      
 220 
     | 
    
         
            +
                    "use whisper_init_from_buffer_with_params_no_state instead"
         
     | 
| 
      
 221 
     | 
    
         
            +
                );
         
     | 
| 
      
 222 
     | 
    
         
            +
                WHISPER_DEPRECATED(
         
     | 
| 
      
 223 
     | 
    
         
            +
                    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
         
     | 
| 
      
 224 
     | 
    
         
            +
                    "use whisper_init_with_params_no_state instead"
         
     | 
| 
      
 225 
     | 
    
         
            +
                );
         
     | 
| 
      
 226 
     | 
    
         
            +
             
     | 
| 
      
 227 
     | 
    
         
            +
                WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
         
     | 
| 
      
 228 
     | 
    
         
            +
             
     | 
| 
      
 229 
     | 
    
         
            +
                // Given a context, enable use of OpenVINO for encode inference.
         
     | 
| 
      
 230 
     | 
    
         
            +
                // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
         
     | 
| 
      
 231 
     | 
    
         
            +
                //                      the path will be generated from the ggml model path that was passed
         
     | 
| 
      
 232 
     | 
    
         
            +
                //                      in to whisper_init_from_file_with_params (or the deprecated whisper_init_from_file). For example, if 'path_model' was
         
     | 
| 
      
 233 
     | 
    
         
            +
                //                      "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
         
     | 
| 
      
 234 
     | 
    
         
            +
                //                      assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
         
     | 
| 
      
 235 
     | 
    
         
            +
                // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
         
     | 
| 
      
 236 
     | 
    
         
            +
                // cache_dir: Optional cache directory that can speed up init time, especially for
         
     | 
| 
      
 237 
     | 
    
         
            +
                //                     GPU, by caching compiled 'blobs' there.
         
     | 
| 
      
 238 
     | 
    
         
            +
                //                     Set to nullptr if not used.
         
     | 
| 
      
 239 
     | 
    
         
            +
                // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
         
     | 
| 
      
 240 
     | 
    
         
            +
                WHISPER_API int whisper_ctx_init_openvino_encoder(
         
     | 
| 
      
 241 
     | 
    
         
            +
                    struct whisper_context * ctx,
         
     | 
| 
      
 242 
     | 
    
         
            +
                                const char * model_path,
         
     | 
| 
      
 243 
     | 
    
         
            +
                                const char * device,
         
     | 
| 
      
 244 
     | 
    
         
            +
                                const char * cache_dir);
         
     | 
| 
      
 245 
     | 
    
         
            +
             
     | 
| 
      
 246 
     | 
    
         
            +
                // Frees all allocated memory
         
     | 
| 
      
 247 
     | 
    
         
            +
                WHISPER_API void whisper_free      (struct whisper_context * ctx);
         
     | 
| 
      
 248 
     | 
    
         
            +
                WHISPER_API void whisper_free_state(struct whisper_state * state);
         
     | 
| 
      
 249 
     | 
    
         
            +
                WHISPER_API void whisper_free_params(struct whisper_full_params * params);
         
     | 
| 
      
 250 
     | 
    
         
            +
                WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
         
     | 
| 
       106 
251 
     | 
    
         | 
| 
       107 
252 
     | 
    
         
             
                // Convert RAW PCM audio to log mel spectrogram.
         
     | 
| 
       108 
     | 
    
         
            -
                // The resulting spectrogram is stored inside the provided whisper context.
         
     | 
| 
      
 253 
     | 
    
         
            +
                // The resulting spectrogram is stored inside the default state of the provided whisper context.
         
     | 
| 
       109 
254 
     | 
    
         
             
                // Returns 0 on success
         
     | 
| 
       110 
255 
     | 
    
         
             
                WHISPER_API int whisper_pcm_to_mel(
         
     | 
| 
       111 
256 
     | 
    
         
             
                        struct whisper_context * ctx,
         
     | 
| 
         @@ -113,17 +258,30 @@ extern "C" { 
     | 
|
| 
       113 
258 
     | 
    
         
             
                                           int   n_samples,
         
     | 
| 
       114 
259 
     | 
    
         
             
                                           int   n_threads);
         
     | 
| 
       115 
260 
     | 
    
         | 
| 
       116 
     | 
    
         
            -
                 
     | 
| 
       117 
     | 
    
         
            -
             
     | 
| 
      
 261 
     | 
    
         
            +
                WHISPER_API int whisper_pcm_to_mel_with_state(
         
     | 
| 
      
 262 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 263 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 264 
     | 
    
         
            +
                                   const float * samples,
         
     | 
| 
      
 265 
     | 
    
         
            +
                                           int   n_samples,
         
     | 
| 
      
 266 
     | 
    
         
            +
                                           int   n_threads);
         
     | 
| 
      
 267 
     | 
    
         
            +
             
     | 
| 
      
 268 
     | 
    
         
            +
                // Convert RAW PCM audio to log mel spectrogram, applying a Phase Vocoder to speed up the audio x2.
         
     | 
| 
      
 269 
     | 
    
         
            +
                // The resulting spectrogram is stored inside the default state of the provided whisper context.
         
     | 
| 
       118 
270 
     | 
    
         
             
                // Returns 0 on success
         
     | 
| 
       119 
271 
     | 
    
         
             
                WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
         
     | 
| 
       120 
     | 
    
         
            -
                    struct whisper_context* ctx,
         
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
             
     | 
| 
       123 
     | 
    
         
            -
             
     | 
| 
       124 
     | 
    
         
            -
             
     | 
| 
       125 
     | 
    
         
            -
             
     | 
| 
       126 
     | 
    
         
            -
             
     | 
| 
      
 272 
     | 
    
         
            +
                    struct whisper_context * ctx,
         
     | 
| 
      
 273 
     | 
    
         
            +
                               const float * samples,
         
     | 
| 
      
 274 
     | 
    
         
            +
                                       int   n_samples,
         
     | 
| 
      
 275 
     | 
    
         
            +
                                       int   n_threads);
         
     | 
| 
      
 276 
     | 
    
         
            +
             
     | 
| 
      
 277 
     | 
    
         
            +
                WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
         
     | 
| 
      
 278 
     | 
    
         
            +
                    struct whisper_context * ctx,
         
     | 
| 
      
 279 
     | 
    
         
            +
                      struct whisper_state * state,
         
     | 
| 
      
 280 
     | 
    
         
            +
                               const float * samples,
         
     | 
| 
      
 281 
     | 
    
         
            +
                                       int   n_samples,
         
     | 
| 
      
 282 
     | 
    
         
            +
                                       int   n_threads);
         
     | 
| 
      
 283 
     | 
    
         
            +
             
     | 
| 
      
 284 
     | 
    
         
            +
                // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
         
     | 
| 
       127 
285 
     | 
    
         
             
                // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
         
     | 
| 
       128 
286 
     | 
    
         
             
                // n_mel must be 80
         
     | 
| 
       129 
287 
     | 
    
         
             
                // Returns 0 on success
         
     | 
| 
         @@ -133,7 +291,14 @@ extern "C" { 
     | 
|
| 
       133 
291 
     | 
    
         
             
                                           int   n_len,
         
     | 
| 
       134 
292 
     | 
    
         
             
                                           int   n_mel);
         
     | 
| 
       135 
293 
     | 
    
         | 
| 
       136 
     | 
    
         
            -
                 
     | 
| 
      
 294 
     | 
    
         
            +
                WHISPER_API int whisper_set_mel_with_state(
         
     | 
| 
      
 295 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 296 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 297 
     | 
    
         
            +
                                   const float * data,
         
     | 
| 
      
 298 
     | 
    
         
            +
                                           int   n_len,
         
     | 
| 
      
 299 
     | 
    
         
            +
                                           int   n_mel);
         
     | 
| 
      
 300 
     | 
    
         
            +
             
     | 
| 
      
 301 
     | 
    
         
            +
                // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
         
     | 
| 
       137 
302 
     | 
    
         
             
                // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
         
     | 
| 
       138 
303 
     | 
    
         
             
                // offset can be used to specify the offset of the first frame in the spectrogram.
         
     | 
| 
       139 
304 
     | 
    
         
             
                // Returns 0 on success
         
     | 
| 
         @@ -142,6 +307,12 @@ extern "C" { 
     | 
|
| 
       142 
307 
     | 
    
         
             
                                           int   offset,
         
     | 
| 
       143 
308 
     | 
    
         
             
                                           int   n_threads);
         
     | 
| 
       144 
309 
     | 
    
         | 
| 
      
 310 
     | 
    
         
            +
                WHISPER_API int whisper_encode_with_state(
         
     | 
| 
      
 311 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 312 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 313 
     | 
    
         
            +
                                           int   offset,
         
     | 
| 
      
 314 
     | 
    
         
            +
                                           int   n_threads);
         
     | 
| 
      
 315 
     | 
    
         
            +
             
     | 
| 
       145 
316 
     | 
    
         
             
                // Run the Whisper decoder to obtain the logits and probabilities for the next token.
         
     | 
| 
       146 
317 
     | 
    
         
             
                // Make sure to call whisper_encode() first.
         
     | 
| 
       147 
318 
     | 
    
         
             
                // tokens + n_tokens is the provided context for the decoder.
         
     | 
| 
         @@ -155,10 +326,18 @@ extern "C" { 
     | 
|
| 
       155 
326 
     | 
    
         
             
                                           int   n_past,
         
     | 
| 
       156 
327 
     | 
    
         
             
                                           int   n_threads);
         
     | 
| 
       157 
328 
     | 
    
         | 
| 
      
 329 
     | 
    
         
            +
                WHISPER_API int whisper_decode_with_state(
         
     | 
| 
      
 330 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 331 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 332 
     | 
    
         
            +
                           const whisper_token * tokens,
         
     | 
| 
      
 333 
     | 
    
         
            +
                                           int   n_tokens,
         
     | 
| 
      
 334 
     | 
    
         
            +
                                           int   n_past,
         
     | 
| 
      
 335 
     | 
    
         
            +
                                           int   n_threads);
         
     | 
| 
      
 336 
     | 
    
         
            +
             
     | 
| 
       158 
337 
     | 
    
         
             
                // Convert the provided text into tokens.
         
     | 
| 
       159 
338 
     | 
    
         
             
                // The tokens pointer must be large enough to hold the resulting tokens.
         
     | 
| 
       160 
339 
     | 
    
         
             
                // Returns the number of tokens on success, no more than n_max_tokens
         
     | 
| 
       161 
     | 
    
         
            -
                // Returns  
     | 
| 
      
 340 
     | 
    
         
            +
                // Returns a negative number on failure - the negated number of tokens that would have been returned
         
     | 
| 
       162 
341 
     | 
    
         
             
                // TODO: not sure if correct
         
     | 
| 
       163 
342 
     | 
    
         
             
                WHISPER_API int whisper_tokenize(
         
     | 
| 
       164 
343 
     | 
    
         
             
                        struct whisper_context * ctx,
         
     | 
| 
         @@ -166,6 +345,10 @@ extern "C" { 
     | 
|
| 
       166 
345 
     | 
    
         
             
                                 whisper_token * tokens,
         
     | 
| 
       167 
346 
     | 
    
         
             
                                           int   n_max_tokens);
         
     | 
| 
       168 
347 
     | 
    
         | 
| 
      
 348 
     | 
    
         
            +
                // Return the number of tokens in the provided text
         
     | 
| 
      
 349 
     | 
    
         
            +
                // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
         
     | 
| 
      
 350 
     | 
    
         
            +
                int whisper_token_count(struct whisper_context * ctx, const char * text);
         
     | 
| 
      
 351 
     | 
    
         
            +
             
     | 
| 
       169 
352 
     | 
    
         
             
                // Largest language id (i.e. number of available languages - 1)
         
     | 
| 
       170 
353 
     | 
    
         
             
                WHISPER_API int whisper_lang_max_id();
         
     | 
| 
       171 
354 
     | 
    
         | 
| 
         @@ -178,11 +361,14 @@ extern "C" { 
     | 
|
| 
       178 
361 
     | 
    
         
             
                // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
         
     | 
| 
       179 
362 
     | 
    
         
             
                WHISPER_API const char * whisper_lang_str(int id);
         
     | 
| 
       180 
363 
     | 
    
         | 
| 
      
 364 
     | 
    
         
            +
                // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
         
     | 
| 
      
 365 
     | 
    
         
            +
                WHISPER_API const char * whisper_lang_str_full(int id);
         
     | 
| 
      
 366 
     | 
    
         
            +
             
     | 
| 
       181 
367 
     | 
    
         
             
                // Use mel data at offset_ms to try and auto-detect the spoken language
         
     | 
| 
       182 
368 
     | 
    
         
             
                // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
         
     | 
| 
       183 
369 
     | 
    
         
             
                // Returns the top language id or negative on failure
         
     | 
| 
       184 
370 
     | 
    
         
             
                // If not null, fills the lang_probs array with the probabilities of all languages
         
     | 
| 
       185 
     | 
    
         
            -
                // The array must be  
     | 
| 
      
 371 
     | 
    
         
            +
                // The array must be whisper_lang_max_id() + 1 in size
         
     | 
| 
       186 
372 
     | 
    
         
             
                // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
         
     | 
| 
       187 
373 
     | 
    
         
             
                WHISPER_API int whisper_lang_auto_detect(
         
     | 
| 
       188 
374 
     | 
    
         
             
                        struct whisper_context * ctx,
         
     | 
| 
         @@ -190,35 +376,60 @@ extern "C" { 
     | 
|
| 
       190 
376 
     | 
    
         
             
                                           int   n_threads,
         
     | 
| 
       191 
377 
     | 
    
         
             
                                         float * lang_probs);
         
     | 
| 
       192 
378 
     | 
    
         | 
| 
       193 
     | 
    
         
            -
                WHISPER_API int  
     | 
| 
       194 
     | 
    
         
            -
             
     | 
| 
       195 
     | 
    
         
            -
             
     | 
| 
       196 
     | 
    
         
            -
             
     | 
| 
       197 
     | 
    
         
            -
             
     | 
| 
      
 379 
     | 
    
         
            +
                WHISPER_API int whisper_lang_auto_detect_with_state(
         
     | 
| 
      
 380 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 381 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 382 
     | 
    
         
            +
                                           int   offset_ms,
         
     | 
| 
      
 383 
     | 
    
         
            +
                                           int   n_threads,
         
     | 
| 
      
 384 
     | 
    
         
            +
                                         float * lang_probs);
         
     | 
| 
      
 385 
     | 
    
         
            +
             
     | 
| 
      
 386 
     | 
    
         
            +
                WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
         
     | 
| 
      
 387 
     | 
    
         
            +
                WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
         
     | 
| 
      
 388 
     | 
    
         
            +
                WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
         
     | 
| 
      
 389 
     | 
    
         
            +
                WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
         
     | 
| 
      
 390 
     | 
    
         
            +
                WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
         
     | 
| 
      
 391 
     | 
    
         
            +
                WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
         
     | 
| 
      
 392 
     | 
    
         
            +
             
     | 
| 
      
 393 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
         
     | 
| 
      
 394 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
         
     | 
| 
      
 395 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
         
     | 
| 
      
 396 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
         
     | 
| 
      
 397 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
         
     | 
| 
      
 398 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
         
     | 
| 
      
 399 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
         
     | 
| 
      
 400 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
         
     | 
| 
      
 401 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
         
     | 
| 
      
 402 
     | 
    
         
            +
                WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
         
     | 
| 
      
 403 
     | 
    
         
            +
                WHISPER_API int whisper_model_ftype        (struct whisper_context * ctx);
         
     | 
| 
      
 404 
     | 
    
         
            +
                WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
         
     | 
| 
       198 
405 
     | 
    
         | 
| 
       199 
406 
     | 
    
         
             
                // Token logits obtained from the last call to whisper_decode()
         
     | 
| 
       200 
407 
     | 
    
         
             
                // The logits for the last token are stored in the last row
         
     | 
| 
       201 
408 
     | 
    
         
             
                // Rows: n_tokens
         
     | 
| 
       202 
409 
     | 
    
         
             
                // Cols: n_vocab
         
     | 
| 
       203 
     | 
    
         
            -
                WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
         
     | 
| 
      
 410 
     | 
    
         
            +
                WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
         
     | 
| 
      
 411 
     | 
    
         
            +
                WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
         
     | 
| 
       204 
412 
     | 
    
         | 
| 
       205 
413 
     | 
    
         
             
                // Token Id -> String. Uses the vocabulary in the provided context
         
     | 
| 
       206 
414 
     | 
    
         
             
                WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
         
     | 
| 
      
 415 
     | 
    
         
            +
                WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
         
     | 
| 
      
 416 
     | 
    
         
            +
             
     | 
| 
       207 
417 
     | 
    
         | 
| 
       208 
418 
     | 
    
         
             
                // Special tokens
         
     | 
| 
       209 
419 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
         
     | 
| 
       210 
420 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
         
     | 
| 
       211 
     | 
    
         
            -
                WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
         
     | 
| 
       212 
421 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
         
     | 
| 
      
 422 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
         
     | 
| 
      
 423 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
         
     | 
| 
       213 
424 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
         
     | 
| 
       214 
425 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
         
     | 
| 
       215 
426 
     | 
    
         
             
                WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
         
     | 
| 
       216 
427 
     | 
    
         | 
| 
       217 
428 
     | 
    
         
             
                // Task tokens
         
     | 
| 
       218 
     | 
    
         
            -
                WHISPER_API whisper_token whisper_token_translate ( 
     | 
| 
       219 
     | 
    
         
            -
                WHISPER_API whisper_token whisper_token_transcribe( 
     | 
| 
      
 429 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
         
     | 
| 
      
 430 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
         
     | 
| 
       220 
431 
     | 
    
         | 
| 
       221 
     | 
    
         
            -
                // Performance information
         
     | 
| 
      
 432 
     | 
    
         
            +
                // Performance information from the default state.
         
     | 
| 
       222 
433 
     | 
    
         
             
                WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
         
     | 
| 
       223 
434 
     | 
    
         
             
                WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
         
     | 
| 
       224 
435 
     | 
    
         | 
| 
         @@ -229,22 +440,36 @@ extern "C" { 
     | 
|
| 
       229 
440 
     | 
    
         | 
| 
       230 
441 
     | 
    
         
             
                // Available sampling strategies
         
     | 
| 
       231 
442 
     | 
    
         
             
                enum whisper_sampling_strategy {
         
     | 
| 
       232 
     | 
    
         
            -
                    WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's  
     | 
| 
      
 443 
     | 
    
         
            +
                    WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
         
     | 
| 
       233 
444 
     | 
    
         
             
                    WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
         
     | 
| 
       234 
445 
     | 
    
         
             
                };
         
     | 
| 
       235 
446 
     | 
    
         | 
| 
       236 
447 
     | 
    
         
             
                // Text segment callback
         
     | 
| 
       237 
448 
     | 
    
         
             
                // Called on every newly generated text segment
         
     | 
| 
       238 
449 
     | 
    
         
             
                // Use the whisper_full_...() functions to obtain the text segments
         
     | 
| 
       239 
     | 
    
         
            -
                typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
         
     | 
| 
      
 450 
     | 
    
         
            +
                typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
         
     | 
| 
      
 451 
     | 
    
         
            +
             
     | 
| 
      
 452 
     | 
    
         
            +
                // Progress callback
         
     | 
| 
      
 453 
     | 
    
         
            +
                typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
         
     | 
| 
       240 
454 
     | 
    
         | 
| 
       241 
455 
     | 
    
         
             
                // Encoder begin callback
         
     | 
| 
       242 
456 
     | 
    
         
             
                // If not NULL, called before the encoder starts
         
     | 
| 
       243 
457 
     | 
    
         
             
                // If it returns false, the computation is aborted
         
     | 
| 
       244 
     | 
    
         
            -
                typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
         
     | 
| 
      
 458 
     | 
    
         
            +
                typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
         
     | 
| 
      
 459 
     | 
    
         
            +
             
     | 
| 
      
 460 
     | 
    
         
            +
                // Logits filter callback
         
     | 
| 
      
 461 
     | 
    
         
            +
                // Can be used to modify the logits before sampling
         
     | 
| 
      
 462 
     | 
    
         
            +
                // If not NULL, called after applying temperature to logits
         
     | 
| 
      
 463 
     | 
    
         
            +
                typedef void (*whisper_logits_filter_callback)(
         
     | 
| 
      
 464 
     | 
    
         
            +
                        struct whisper_context * ctx,
         
     | 
| 
      
 465 
     | 
    
         
            +
                          struct whisper_state * state,
         
     | 
| 
      
 466 
     | 
    
         
            +
                      const whisper_token_data * tokens,
         
     | 
| 
      
 467 
     | 
    
         
            +
                                           int   n_tokens,
         
     | 
| 
      
 468 
     | 
    
         
            +
                                         float * logits,
         
     | 
| 
      
 469 
     | 
    
         
            +
                                          void * user_data);
         
     | 
| 
       245 
470 
     | 
    
         | 
| 
       246 
471 
     | 
    
         
             
                // Parameters for the whisper_full() function
         
     | 
| 
       247 
     | 
    
         
            -
                // If you  
     | 
| 
      
 472 
     | 
    
         
            +
                // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
         
     | 
| 
       248 
473 
     | 
    
         
             
                // whisper_full_default_params()
         
     | 
| 
       249 
474 
     | 
    
         
             
                struct whisper_full_params {
         
     | 
| 
       250 
475 
     | 
    
         
             
                    enum whisper_sampling_strategy strategy;
         
     | 
| 
         @@ -256,6 +481,7 @@ extern "C" { 
     | 
|
| 
       256 
481 
     | 
    
         | 
| 
       257 
482 
     | 
    
         
             
                    bool translate;
         
     | 
| 
       258 
483 
     | 
    
         
             
                    bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
         
     | 
| 
      
 484 
     | 
    
         
            +
                    bool no_timestamps;     // do not generate timestamps
         
     | 
| 
       259 
485 
     | 
    
         
             
                    bool single_segment;    // force single segment output (useful for streaming)
         
     | 
| 
       260 
486 
     | 
    
         
             
                    bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
         
     | 
| 
       261 
487 
     | 
    
         
             
                    bool print_progress;    // print progress information
         
     | 
| 
         @@ -273,15 +499,26 @@ extern "C" { 
     | 
|
| 
       273 
499 
     | 
    
         
             
                    // [EXPERIMENTAL] speed-up techniques
         
     | 
| 
       274 
500 
     | 
    
         
             
                    // note: these can significantly reduce the quality of the output
         
     | 
| 
       275 
501 
     | 
    
         
             
                    bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
         
     | 
| 
      
 502 
     | 
    
         
            +
                    bool debug_mode;        // enable debug mode, which provides extra info (e.g. dumps log_mel)
         
     | 
| 
       276 
503 
     | 
    
         
             
                    int  audio_ctx;         // overwrite the audio context size (0 = use default)
         
     | 
| 
       277 
504 
     | 
    
         | 
| 
      
 505 
     | 
    
         
            +
                    // [EXPERIMENTAL] [TDRZ] tinydiarize
         
     | 
| 
      
 506 
     | 
    
         
            +
                    bool tdrz_enable;       // enable tinydiarize speaker turn detection
         
     | 
| 
      
 507 
     | 
    
         
            +
             
     | 
| 
      
 508 
     | 
    
         
            +
                    // A regular expression that matches tokens to suppress
         
     | 
| 
      
 509 
     | 
    
         
            +
                    const char * suppress_regex;
         
     | 
| 
      
 510 
     | 
    
         
            +
             
     | 
| 
       278 
511 
     | 
    
         
             
                    // tokens to provide to the whisper decoder as initial prompt
         
     | 
| 
       279 
512 
     | 
    
         
             
                    // these are prepended to any existing text context from a previous call
         
     | 
| 
      
 513 
     | 
    
         
            +
                    // use whisper_tokenize() to convert text to tokens
         
     | 
| 
      
 514 
     | 
    
         
            +
                    // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
         
     | 
| 
      
 515 
     | 
    
         
            +
                    const char * initial_prompt;
         
     | 
| 
       280 
516 
     | 
    
         
             
                    const whisper_token * prompt_tokens;
         
     | 
| 
       281 
517 
     | 
    
         
             
                    int prompt_n_tokens;
         
     | 
| 
       282 
518 
     | 
    
         | 
| 
       283 
519 
     | 
    
         
             
                    // for auto-detection, set to nullptr, "" or "auto"
         
     | 
| 
       284 
520 
     | 
    
         
             
                    const char * language;
         
     | 
| 
      
 521 
     | 
    
         
            +
                    bool detect_language;
         
     | 
| 
       285 
522 
     | 
    
         | 
| 
       286 
523 
     | 
    
         
             
                    // common decoding parameters:
         
     | 
| 
       287 
524 
     | 
    
         
             
                    bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
         
     | 
| 
         @@ -312,14 +549,36 @@ extern "C" { 
     | 
|
| 
       312 
549 
     | 
    
         
             
                    whisper_new_segment_callback new_segment_callback;
         
     | 
| 
       313 
550 
     | 
    
         
             
                    void * new_segment_callback_user_data;
         
     | 
| 
       314 
551 
     | 
    
         | 
| 
      
 552 
     | 
    
         
            +
                    // called on each progress update
         
     | 
| 
      
 553 
     | 
    
         
            +
                    whisper_progress_callback progress_callback;
         
     | 
| 
      
 554 
     | 
    
         
            +
                    void * progress_callback_user_data;
         
     | 
| 
      
 555 
     | 
    
         
            +
             
     | 
| 
       315 
556 
     | 
    
         
             
                    // called each time before the encoder starts
         
     | 
| 
       316 
557 
     | 
    
         
             
                    whisper_encoder_begin_callback encoder_begin_callback;
         
     | 
| 
       317 
558 
     | 
    
         
             
                    void * encoder_begin_callback_user_data;
         
     | 
| 
      
 559 
     | 
    
         
            +
             
     | 
| 
      
 560 
     | 
    
         
            +
                    // called each time before ggml computation starts
         
     | 
| 
      
 561 
     | 
    
         
            +
                    ggml_abort_callback abort_callback;
         
     | 
| 
      
 562 
     | 
    
         
            +
                    void * abort_callback_user_data;
         
     | 
| 
      
 563 
     | 
    
         
            +
             
     | 
| 
      
 564 
     | 
    
         
            +
                    // called by each decoder to filter obtained logits
         
     | 
| 
      
 565 
     | 
    
         
            +
                    whisper_logits_filter_callback logits_filter_callback;
         
     | 
| 
      
 566 
     | 
    
         
            +
                    void * logits_filter_callback_user_data;
         
     | 
| 
      
 567 
     | 
    
         
            +
             
     | 
| 
      
 568 
     | 
    
         
            +
                    const whisper_grammar_element ** grammar_rules;
         
     | 
| 
      
 569 
     | 
    
         
            +
                    size_t                           n_grammar_rules;
         
     | 
| 
      
 570 
     | 
    
         
            +
                    size_t                           i_start_rule;
         
     | 
| 
      
 571 
     | 
    
         
            +
                    float                            grammar_penalty;
         
     | 
| 
       318 
572 
     | 
    
         
             
                };
         
     | 
| 
       319 
573 
     | 
    
         | 
| 
      
 574 
     | 
    
         
            +
                // NOTE: the *_by_ref() functions allocate memory, and it is the responsibility of the caller to free the returned pointer - see whisper_free_context_params() & whisper_free_params()
         
     | 
| 
      
 575 
     | 
    
         
            +
                WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
         
     | 
| 
      
 576 
     | 
    
         
            +
                WHISPER_API struct whisper_context_params whisper_context_default_params(void);
         
     | 
| 
      
 577 
     | 
    
         
            +
                WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
         
     | 
| 
       320 
578 
     | 
    
         
             
                WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
         
     | 
| 
       321 
579 
     | 
    
         | 
| 
       322 
580 
     | 
    
         
             
                // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
         
     | 
| 
      
 581 
     | 
    
         
            +
                // Not thread safe for same context
         
     | 
| 
       323 
582 
     | 
    
         
             
                // Uses the specified decoding strategy to obtain the text.
         
     | 
| 
       324 
583 
     | 
    
         
             
                WHISPER_API int whisper_full(
         
     | 
| 
       325 
584 
     | 
    
         
             
                            struct whisper_context * ctx,
         
     | 
| 
         @@ -327,7 +586,16 @@ extern "C" { 
     | 
|
| 
       327 
586 
     | 
    
         
             
                                       const float * samples,
         
     | 
| 
       328 
587 
     | 
    
         
             
                                               int   n_samples);
         
     | 
| 
       329 
588 
     | 
    
         | 
| 
       330 
     | 
    
         
            -
                 
     | 
| 
      
 589 
     | 
    
         
            +
                WHISPER_API int whisper_full_with_state(
         
     | 
| 
      
 590 
     | 
    
         
            +
                            struct whisper_context * ctx,
         
     | 
| 
      
 591 
     | 
    
         
            +
                              struct whisper_state * state,
         
     | 
| 
      
 592 
     | 
    
         
            +
                        struct whisper_full_params   params,
         
     | 
| 
      
 593 
     | 
    
         
            +
                                       const float * samples,
         
     | 
| 
      
 594 
     | 
    
         
            +
                                               int   n_samples);
         
     | 
| 
      
 595 
     | 
    
         
            +
             
     | 
| 
      
 596 
     | 
    
         
            +
                // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
         
     | 
| 
      
 597 
     | 
    
         
            +
                // Result is stored in the default state of the context
         
     | 
| 
      
 598 
     | 
    
         
            +
                // Not thread safe if executed in parallel on the same context.
         
     | 
| 
       331 
599 
     | 
    
         
             
                // It seems this approach can offer some speedup in some cases.
         
     | 
| 
       332 
600 
     | 
    
         
             
                // However, the transcription accuracy can be worse at the beginning and end of each chunk.
         
     | 
| 
       333 
601 
     | 
    
         
             
                WHISPER_API int whisper_full_parallel(
         
     | 
| 
         @@ -337,40 +605,64 @@ extern "C" { 
     | 
|
| 
       337 
605 
     | 
    
         
             
                                               int   n_samples,
         
     | 
| 
       338 
606 
     | 
    
         
             
                                               int   n_processors);
         
     | 
| 
       339 
607 
     | 
    
         | 
| 
       340 
     | 
    
         
            -
                // Number of generated text segments 
     | 
| 
      
 608 
     | 
    
         
            +
                // Number of generated text segments
         
     | 
| 
       341 
609 
     | 
    
         
             
                // A segment can be a few words, a sentence, or even a paragraph.
         
     | 
| 
       342 
     | 
    
         
            -
                WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
         
     | 
| 
      
 610 
     | 
    
         
            +
                WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
         
     | 
| 
      
 611 
     | 
    
         
            +
                WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
         
     | 
| 
       343 
612 
     | 
    
         | 
| 
       344 
     | 
    
         
            -
                // Language id associated with the  
     | 
| 
      
 613 
     | 
    
         
            +
                // Language id associated with the context's default state
         
     | 
| 
       345 
614 
     | 
    
         
             
                WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
         
     | 
| 
       346 
615 
     | 
    
         | 
| 
       347 
     | 
    
         
            -
                //  
     | 
| 
       348 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
       349 
     | 
    
         
            -
             
     | 
| 
      
 616 
     | 
    
         
            +
                // Language id associated with the provided state
         
     | 
| 
      
 617 
     | 
    
         
            +
                WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
         
     | 
| 
      
 618 
     | 
    
         
            +
             
     | 
| 
      
 619 
     | 
    
         
            +
                // Get the start and end time of the specified segment
         
     | 
| 
      
 620 
     | 
    
         
            +
                WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
         
     | 
| 
      
 621 
     | 
    
         
            +
                WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
         
     | 
| 
      
 622 
     | 
    
         
            +
             
     | 
| 
      
 623 
     | 
    
         
            +
                WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
         
     | 
| 
      
 624 
     | 
    
         
            +
                WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
         
     | 
| 
       350 
625 
     | 
    
         | 
| 
       351 
     | 
    
         
            -
                // Get the  
     | 
| 
       352 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
      
 626 
     | 
    
         
            +
                // Get whether the next segment is predicted as a speaker turn
         
     | 
| 
      
 627 
     | 
    
         
            +
                WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
         
     | 
| 
      
 628 
     | 
    
         
            +
                WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
         
     | 
| 
       353 
629 
     | 
    
         | 
| 
       354 
     | 
    
         
            -
                // Get  
     | 
| 
       355 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
      
 630 
     | 
    
         
            +
                // Get the text of the specified segment
         
     | 
| 
      
 631 
     | 
    
         
            +
                WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
         
     | 
| 
      
 632 
     | 
    
         
            +
                WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
         
     | 
| 
       356 
633 
     | 
    
         | 
| 
       357 
     | 
    
         
            -
                // Get  
     | 
| 
       358 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
       359 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
      
 634 
     | 
    
         
            +
                // Get number of tokens in the specified segment
         
     | 
| 
      
 635 
     | 
    
         
            +
                WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
         
     | 
| 
      
 636 
     | 
    
         
            +
                WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
         
     | 
| 
       360 
637 
     | 
    
         | 
| 
       361 
     | 
    
         
            -
                // Get token  
     | 
| 
      
 638 
     | 
    
         
            +
                // Get the token text of the specified token in the specified segment
         
     | 
| 
      
 639 
     | 
    
         
            +
                WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 640 
     | 
    
         
            +
                WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
         
     | 
| 
      
 641 
     | 
    
         
            +
             
     | 
| 
      
 642 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 643 
     | 
    
         
            +
                WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
         
     | 
| 
      
 644 
     | 
    
         
            +
             
     | 
| 
      
 645 
     | 
    
         
            +
                // Get token data for the specified token in the specified segment
         
     | 
| 
       362 
646 
     | 
    
         
             
                // This contains probabilities, timestamps, etc.
         
     | 
| 
       363 
     | 
    
         
            -
                WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 647 
     | 
    
         
            +
                WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 648 
     | 
    
         
            +
                WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
         
     | 
| 
       364 
649 
     | 
    
         | 
| 
       365 
     | 
    
         
            -
                // Get the probability of the specified token in the specified segment 
     | 
| 
       366 
     | 
    
         
            -
                WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 650 
     | 
    
         
            +
                // Get the probability of the specified token in the specified segment
         
     | 
| 
      
 651 
     | 
    
         
            +
                WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
         
     | 
| 
      
 652 
     | 
    
         
            +
                WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
         
     | 
| 
       367 
653 
     | 
    
         | 
| 
       368 
654 
     | 
    
         
             
                ////////////////////////////////////////////////////////////////////////////
         
     | 
| 
       369 
655 
     | 
    
         | 
| 
       370 
656 
     | 
    
         
             
                // Temporary helpers needed for exposing ggml interface
         
     | 
| 
       371 
657 
     | 
    
         | 
| 
       372 
     | 
    
         
            -
                WHISPER_API int 
     | 
| 
       373 
     | 
    
         
            -
                WHISPER_API  
     | 
| 
      
 658 
     | 
    
         
            +
                WHISPER_API int          whisper_bench_memcpy          (int n_threads);
         
     | 
| 
      
 659 
     | 
    
         
            +
                WHISPER_API const char * whisper_bench_memcpy_str      (int n_threads);
         
     | 
| 
      
 660 
     | 
    
         
            +
                WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
         
     | 
| 
      
 661 
     | 
    
         
            +
                WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
         
     | 
| 
      
 662 
     | 
    
         
            +
             
     | 
| 
      
 663 
     | 
    
         
            +
                // Control logging output; default behavior is to print to stderr
         
     | 
| 
      
 664 
     | 
    
         
            +
             
     | 
| 
      
 665 
     | 
    
         
            +
                WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
         
     | 
| 
       374 
666 
     | 
    
         | 
| 
       375 
667 
     | 
    
         
             
            #ifdef __cplusplus
         
     | 
| 
       376 
668 
     | 
    
         
             
            }
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: whispercpp
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 1.3.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Georgi Gerganov
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire:
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date:  
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2024-05-14 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       14 
14 
     | 
    
         
             
            description: High-performance inference of OpenAI's Whisper automatic speech recognition
         
     | 
| 
       15 
15 
     | 
    
         
             
              (ASR) model via Ruby
         
     | 
| 
         @@ -55,7 +55,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       55 
55 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       56 
56 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       57 
57 
     | 
    
         
             
            requirements: []
         
     | 
| 
       58 
     | 
    
         
            -
            rubygems_version: 3. 
     | 
| 
      
 58 
     | 
    
         
            +
            rubygems_version: 3.5.9
         
     | 
| 
       59 
59 
     | 
    
         
             
            signing_key:
         
     | 
| 
       60 
60 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       61 
61 
     | 
    
         
             
            summary: Ruby whisper.cpp bindings
         
     |