llama_cpp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
    
        data/ext/llama_cpp/src/llama.h
    CHANGED
    
    | @@ -37,6 +37,8 @@ | |
| 37 37 |  | 
| 38 38 | 
             
            #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
         | 
| 39 39 |  | 
| 40 | 
            +
            #define LLAMA_MAX_RNG_STATE (64*1024)
         | 
| 41 | 
            +
             | 
| 40 42 | 
             
            #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
         | 
| 41 43 |  | 
| 42 44 | 
             
            #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
         | 
| @@ -60,13 +62,9 @@ extern "C" { | |
| 60 62 | 
             
                struct llama_model;
         | 
| 61 63 | 
             
                struct llama_context;
         | 
| 62 64 |  | 
| 63 | 
            -
                typedef  | 
| 64 | 
            -
             | 
| 65 | 
            -
                 | 
| 66 | 
            -
                    LLAMA_LOG_LEVEL_ERROR = 2,
         | 
| 67 | 
            -
                    LLAMA_LOG_LEVEL_WARN  = 3,
         | 
| 68 | 
            -
                    LLAMA_LOG_LEVEL_INFO  = 4
         | 
| 69 | 
            -
                };
         | 
| 65 | 
            +
                typedef int32_t llama_pos;
         | 
| 66 | 
            +
                typedef int32_t llama_token;
         | 
| 67 | 
            +
                typedef int32_t llama_seq_id;
         | 
| 70 68 |  | 
| 71 69 | 
             
                enum llama_vocab_type {
         | 
| 72 70 | 
             
                    LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
         | 
| @@ -86,24 +84,24 @@ extern "C" { | |
| 86 84 | 
             
                // model file types
         | 
| 87 85 | 
             
                enum llama_ftype {
         | 
| 88 86 | 
             
                    LLAMA_FTYPE_ALL_F32              = 0,
         | 
| 89 | 
            -
                    LLAMA_FTYPE_MOSTLY_F16           = 1, | 
| 90 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q4_0          = 2, | 
| 91 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q4_1          = 3, | 
| 92 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, | 
| 93 | 
            -
                    // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, | 
| 94 | 
            -
                    // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, | 
| 95 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q8_0          = 7, | 
| 96 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q5_0          = 8, | 
| 97 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q5_1          = 9, | 
| 98 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q2_K          = 10 | 
| 99 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11 | 
| 100 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12 | 
| 101 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13 | 
| 102 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14 | 
| 103 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15 | 
| 104 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16 | 
| 105 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17 | 
| 106 | 
            -
                    LLAMA_FTYPE_MOSTLY_Q6_K          = 18 | 
| 87 | 
            +
                    LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
         | 
| 88 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
         | 
| 89 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
         | 
| 90 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
         | 
| 91 | 
            +
                    // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
         | 
| 92 | 
            +
                    // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
         | 
| 93 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
         | 
| 94 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
         | 
| 95 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
         | 
| 96 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
         | 
| 97 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
         | 
| 98 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
         | 
| 99 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
         | 
| 100 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
         | 
| 101 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
         | 
| 102 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
         | 
| 103 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
         | 
| 104 | 
            +
                    LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
         | 
| 107 105 |  | 
| 108 106 | 
             
                    LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
         | 
| 109 107 | 
             
                };
         | 
| @@ -122,41 +120,68 @@ extern "C" { | |
| 122 120 |  | 
| 123 121 | 
             
                typedef void (*llama_progress_callback)(float progress, void *ctx);
         | 
| 124 122 |  | 
| 125 | 
            -
                 | 
| 126 | 
            -
             | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
             | 
| 130 | 
            -
             | 
| 131 | 
            -
             | 
| 123 | 
            +
                // Input data for llama_decode
         | 
| 124 | 
            +
                // A llama_batch object can contain input about one or many sequences
         | 
| 125 | 
            +
                // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
         | 
| 126 | 
            +
                //
         | 
| 127 | 
            +
                // - token  : the token ids of the input (used when embd is NULL)
         | 
| 128 | 
            +
                // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         | 
| 129 | 
            +
                // - pos    : the positions of the respective token in the sequence
         | 
| 130 | 
            +
                // - seq_id : the sequence to which the respective token belongs
         | 
| 131 | 
            +
                // - logits : if zero, the logits for the respective token will not be output
         | 
| 132 | 
            +
                //
         | 
| 133 | 
            +
                typedef struct llama_batch {
         | 
| 134 | 
            +
                    int32_t n_tokens;
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                    llama_token  * token;
         | 
| 137 | 
            +
                    float        * embd;
         | 
| 138 | 
            +
                    llama_pos    * pos;
         | 
| 139 | 
            +
                    llama_seq_id * seq_id;
         | 
| 140 | 
            +
                    int8_t       * logits;
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                    // NOTE: helpers for smooth API transition - can be deprecated in the future
         | 
| 143 | 
            +
                    //       for future-proof code, use the above fields instead and ignore everything below
         | 
| 144 | 
            +
                    //
         | 
| 145 | 
            +
                    // pos[i] = all_pos_0 + i*all_pos_1
         | 
| 146 | 
            +
                    //
         | 
| 147 | 
            +
                    llama_pos    all_pos_0;  // used if pos == NULL
         | 
| 148 | 
            +
                    llama_pos    all_pos_1;  // used if pos == NULL
         | 
| 149 | 
            +
                    llama_seq_id all_seq_id; // used if seq_id == NULL
         | 
| 150 | 
            +
                } llama_batch;
         | 
| 151 | 
            +
             | 
| 152 | 
            +
                struct llama_model_params {
         | 
| 153 | 
            +
                    int32_t n_gpu_layers; // number of layers to store in VRAM
         | 
| 154 | 
            +
                    int32_t main_gpu;     // the GPU that is used for scratch and small tensors
         | 
| 132 155 | 
             
                    const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
         | 
| 133 156 |  | 
| 134 | 
            -
                    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         | 
| 135 | 
            -
                    float    rope_freq_base;  // RoPE base frequency
         | 
| 136 | 
            -
                    float    rope_freq_scale; // RoPE frequency scaling factor
         | 
| 137 | 
            -
             | 
| 138 157 | 
             
                    // called with a progress value between 0 and 1, pass NULL to disable
         | 
| 139 158 | 
             
                    llama_progress_callback progress_callback;
         | 
| 140 159 | 
             
                    // context pointer passed to the progress callback
         | 
| 141 160 | 
             
                    void * progress_callback_user_data;
         | 
| 142 161 |  | 
| 143 162 | 
             
                    // Keep the booleans together to avoid misalignment during copy-by-value.
         | 
| 144 | 
            -
                    bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         | 
| 145 | 
            -
                    bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         | 
| 146 | 
            -
                    bool f16_kv;     // use fp16 for KV cache
         | 
| 147 | 
            -
                    bool logits_all; // the llama_eval() call computes all logits, not just the last one
         | 
| 148 163 | 
             
                    bool vocab_only; // only load the vocabulary, no weights
         | 
| 149 164 | 
             
                    bool use_mmap;   // use mmap if possible
         | 
| 150 165 | 
             
                    bool use_mlock;  // force system to keep model in RAM
         | 
| 151 | 
            -
                    bool embedding;  // embedding mode only
         | 
| 152 166 | 
             
                };
         | 
| 153 167 |  | 
| 154 | 
            -
                 | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
             | 
| 158 | 
            -
             | 
| 159 | 
            -
             | 
| 168 | 
            +
                struct llama_context_params {
         | 
| 169 | 
            +
                    uint32_t seed;            // RNG seed, -1 for random
         | 
| 170 | 
            +
                    uint32_t n_ctx;           // text context
         | 
| 171 | 
            +
                    uint32_t n_batch;         // prompt processing batch size
         | 
| 172 | 
            +
                    uint32_t n_threads;       // number of threads to use for generation
         | 
| 173 | 
            +
                    uint32_t n_threads_batch; // number of threads to use for batch processing
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         | 
| 176 | 
            +
                    float rope_freq_base;  // RoPE base frequency
         | 
| 177 | 
            +
                    float rope_freq_scale; // RoPE frequency scaling factor
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                    // Keep the booleans together to avoid misalignment during copy-by-value.
         | 
| 180 | 
            +
                    bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         | 
| 181 | 
            +
                    bool f16_kv;     // use fp16 for KV cache
         | 
| 182 | 
            +
                    bool logits_all; // the llama_eval() call computes all logits, not just the last one
         | 
| 183 | 
            +
                    bool embedding;  // embedding mode only
         | 
| 184 | 
            +
                };
         | 
| 160 185 |  | 
| 161 186 | 
             
                // model quantization parameters
         | 
| 162 187 | 
             
                typedef struct llama_model_quantize_params {
         | 
| @@ -215,6 +240,8 @@ extern "C" { | |
| 215 240 | 
             
                    int32_t n_eval;
         | 
| 216 241 | 
             
                };
         | 
| 217 242 |  | 
| 243 | 
            +
                // Helpers for getting default parameters
         | 
| 244 | 
            +
                LLAMA_API struct llama_model_params llama_model_default_params(void);
         | 
| 218 245 | 
             
                LLAMA_API struct llama_context_params llama_context_default_params(void);
         | 
| 219 246 | 
             
                LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
         | 
| 220 247 |  | 
| @@ -228,7 +255,7 @@ extern "C" { | |
| 228 255 |  | 
| 229 256 | 
             
                LLAMA_API struct llama_model * llama_load_model_from_file(
         | 
| 230 257 | 
             
                                         const char * path_model,
         | 
| 231 | 
            -
                        struct  | 
| 258 | 
            +
                        struct llama_model_params     params);
         | 
| 232 259 |  | 
| 233 260 | 
             
                LLAMA_API void llama_free_model(struct llama_model * model);
         | 
| 234 261 |  | 
| @@ -245,25 +272,28 @@ extern "C" { | |
| 245 272 | 
             
                LLAMA_API bool llama_mmap_supported (void);
         | 
| 246 273 | 
             
                LLAMA_API bool llama_mlock_supported(void);
         | 
| 247 274 |  | 
| 248 | 
            -
                LLAMA_API  | 
| 275 | 
            +
                LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
         | 
| 276 | 
            +
             | 
| 249 277 | 
             
                LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
         | 
| 250 | 
            -
                LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
         | 
| 251 | 
            -
                LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
         | 
| 252 278 |  | 
| 253 | 
            -
                LLAMA_API enum llama_vocab_type llama_vocab_type(const struct  | 
| 279 | 
            +
                LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
         | 
| 254 280 |  | 
| 255 | 
            -
                LLAMA_API int  | 
| 256 | 
            -
                LLAMA_API int  | 
| 257 | 
            -
                LLAMA_API int  | 
| 258 | 
            -
                LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
         | 
| 281 | 
            +
                LLAMA_API int llama_n_vocab    (const struct llama_model * model);
         | 
| 282 | 
            +
                LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
         | 
| 283 | 
            +
                LLAMA_API int llama_n_embd     (const struct llama_model * model);
         | 
| 259 284 |  | 
| 260 285 | 
             
                // Get a string describing the model type
         | 
| 261 286 | 
             
                LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
         | 
| 287 | 
            +
             | 
| 262 288 | 
             
                // Returns the total size of all the tensors in the model in bytes
         | 
| 263 289 | 
             
                LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
         | 
| 290 | 
            +
             | 
| 264 291 | 
             
                // Returns the total number of parameters in the model
         | 
| 265 292 | 
             
                LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
         | 
| 266 293 |  | 
| 294 | 
            +
                // Get a llama model tensor
         | 
| 295 | 
            +
                LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
         | 
| 296 | 
            +
             | 
| 267 297 | 
             
                // Returns 0 on success
         | 
| 268 298 | 
             
                LLAMA_API int llama_model_quantize(
         | 
| 269 299 | 
             
                        const char * fname_inp,
         | 
| @@ -279,21 +309,65 @@ extern "C" { | |
| 279 309 | 
             
                LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
         | 
| 280 310 | 
             
                        struct llama_context * ctx,
         | 
| 281 311 | 
             
                                  const char * path_lora,
         | 
| 312 | 
            +
                                       float   scale,
         | 
| 282 313 | 
             
                                  const char * path_base_model,
         | 
| 283 314 | 
             
                                         int   n_threads),
         | 
| 284 | 
            -
                        " | 
| 315 | 
            +
                        "use llama_model_apply_lora_from_file instead");
         | 
| 285 316 |  | 
| 286 317 | 
             
                LLAMA_API int llama_model_apply_lora_from_file(
         | 
| 287 318 | 
             
                        const struct llama_model * model,
         | 
| 288 | 
            -
             | 
| 289 | 
            -
             | 
| 290 | 
            -
             | 
| 319 | 
            +
                                  const char * path_lora,
         | 
| 320 | 
            +
                                       float   scale,
         | 
| 321 | 
            +
                                  const char * path_base_model,
         | 
| 322 | 
            +
                                         int   n_threads);
         | 
| 323 | 
            +
             | 
| 324 | 
            +
                //
         | 
| 325 | 
            +
                // KV cache
         | 
| 326 | 
            +
                //
         | 
| 291 327 |  | 
| 292 328 | 
             
                // Returns the number of tokens in the KV cache
         | 
| 293 | 
            -
                LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx) | 
| 329 | 
            +
                LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
         | 
| 330 | 
            +
                        "avoid using this, it will be removed in the future, instead - count the tokens in user code");
         | 
| 294 331 |  | 
| 295 | 
            -
                //  | 
| 296 | 
            -
                LLAMA_API void  | 
| 332 | 
            +
                // Remove all tokens data of cells in [c0, c1)
         | 
| 333 | 
            +
                LLAMA_API void llama_kv_cache_tokens_rm(
         | 
| 334 | 
            +
                        struct llama_context * ctx,
         | 
| 335 | 
            +
                                     int32_t   c0,
         | 
| 336 | 
            +
                                     int32_t   c1);
         | 
| 337 | 
            +
             | 
| 338 | 
            +
                // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
         | 
| 339 | 
            +
                LLAMA_API void llama_kv_cache_seq_rm(
         | 
| 340 | 
            +
                        struct llama_context * ctx,
         | 
| 341 | 
            +
                                llama_seq_id   seq_id,
         | 
| 342 | 
            +
                                   llama_pos   p0,
         | 
| 343 | 
            +
                                   llama_pos   p1);
         | 
| 344 | 
            +
             | 
| 345 | 
            +
                // Copy all tokens that belong to the specified sequence to another sequence
         | 
| 346 | 
            +
                // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
         | 
| 347 | 
            +
                LLAMA_API void llama_kv_cache_seq_cp(
         | 
| 348 | 
            +
                        struct llama_context * ctx,
         | 
| 349 | 
            +
                                llama_seq_id   seq_id_src,
         | 
| 350 | 
            +
                                llama_seq_id   seq_id_dst,
         | 
| 351 | 
            +
                                   llama_pos   p0,
         | 
| 352 | 
            +
                                   llama_pos   p1);
         | 
| 353 | 
            +
             | 
| 354 | 
            +
                // Removes all tokens that do not belong to the specified sequence
         | 
| 355 | 
            +
                LLAMA_API void llama_kv_cache_seq_keep(
         | 
| 356 | 
            +
                        struct llama_context * ctx,
         | 
| 357 | 
            +
                                llama_seq_id   seq_id);
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
         | 
| 360 | 
            +
                // If the KV cache is RoPEd, the KV data is updated accordingly
         | 
| 361 | 
            +
                LLAMA_API void llama_kv_cache_seq_shift(
         | 
| 362 | 
            +
                        struct llama_context * ctx,
         | 
| 363 | 
            +
                                llama_seq_id   seq_id,
         | 
| 364 | 
            +
                                   llama_pos   p0,
         | 
| 365 | 
            +
                                   llama_pos   p1,
         | 
| 366 | 
            +
                                   llama_pos   delta);
         | 
| 367 | 
            +
             | 
| 368 | 
            +
                //
         | 
| 369 | 
            +
                // State / sessions
         | 
| 370 | 
            +
                //
         | 
| 297 371 |  | 
| 298 372 | 
             
                // Returns the maximum size in bytes of the state (rng, logits, embedding
         | 
| 299 373 | 
             
                // and kv_cache) - will often be smaller after compacting tokens
         | 
| @@ -302,48 +376,102 @@ extern "C" { | |
| 302 376 | 
             
                // Copies the state to the specified destination address.
         | 
| 303 377 | 
             
                // Destination needs to have allocated enough memory.
         | 
| 304 378 | 
             
                // Returns the number of bytes copied
         | 
| 305 | 
            -
                LLAMA_API size_t llama_copy_state_data( | 
| 379 | 
            +
                LLAMA_API size_t llama_copy_state_data(
         | 
| 380 | 
            +
                        struct llama_context * ctx,
         | 
| 381 | 
            +
                                     uint8_t * dst);
         | 
| 306 382 |  | 
| 307 383 | 
             
                // Set the state reading from the specified address
         | 
| 308 384 | 
             
                // Returns the number of bytes read
         | 
| 309 | 
            -
                LLAMA_API size_t llama_set_state_data( | 
| 385 | 
            +
                LLAMA_API size_t llama_set_state_data(
         | 
| 386 | 
            +
                        struct llama_context * ctx,
         | 
| 387 | 
            +
                                     uint8_t * src);
         | 
| 310 388 |  | 
| 311 389 | 
             
                // Save/load session file
         | 
| 312 | 
            -
                LLAMA_API bool llama_load_session_file( | 
| 313 | 
            -
             | 
| 390 | 
            +
                LLAMA_API bool llama_load_session_file(
         | 
| 391 | 
            +
                        struct llama_context * ctx,
         | 
| 392 | 
            +
                                  const char * path_session,
         | 
| 393 | 
            +
                                 llama_token * tokens_out,
         | 
| 394 | 
            +
                                      size_t   n_token_capacity,
         | 
| 395 | 
            +
                                      size_t * n_token_count_out);
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                LLAMA_API bool llama_save_session_file(
         | 
| 398 | 
            +
                        struct llama_context * ctx,
         | 
| 399 | 
            +
                                  const char * path_session,
         | 
| 400 | 
            +
                           const llama_token * tokens,
         | 
| 401 | 
            +
                                      size_t   n_token_count);
         | 
| 402 | 
            +
             | 
| 403 | 
            +
                //
         | 
| 404 | 
            +
                // Decoding
         | 
| 405 | 
            +
                //
         | 
| 314 406 |  | 
| 315 | 
            -
                // Run the llama inference to obtain the logits and probabilities for the next token.
         | 
| 407 | 
            +
                // Run the llama inference to obtain the logits and probabilities for the next token(s).
         | 
| 316 408 | 
             
                // tokens + n_tokens is the provided batch of new tokens to process
         | 
| 317 409 | 
             
                // n_past is the number of tokens to use from previous eval calls
         | 
| 318 410 | 
             
                // Returns 0 on success
         | 
| 319 | 
            -
                 | 
| 411 | 
            +
                // DEPRECATED: use llama_decode() instead
         | 
| 412 | 
            +
                LLAMA_API DEPRECATED(int llama_eval(
         | 
| 320 413 | 
             
                        struct llama_context * ctx,
         | 
| 321 | 
            -
             | 
| 322 | 
            -
             | 
| 323 | 
            -
                                         int   n_past,
         | 
| 324 | 
            -
             | 
| 414 | 
            +
                                 llama_token * tokens,
         | 
| 415 | 
            +
                                     int32_t   n_tokens,
         | 
| 416 | 
            +
                                         int   n_past),
         | 
| 417 | 
            +
                        "use llama_decode() instead");
         | 
| 325 418 |  | 
| 326 419 | 
             
                // Same as llama_eval, but use float matrix input directly.
         | 
| 327 | 
            -
                 | 
| 420 | 
            +
                // DEPRECATED: use llama_decode() instead
         | 
| 421 | 
            +
                LLAMA_API DEPRECATED(int llama_eval_embd(
         | 
| 328 422 | 
             
                        struct llama_context * ctx,
         | 
| 329 | 
            -
             | 
| 330 | 
            -
             | 
| 331 | 
            -
                                         int   n_past,
         | 
| 332 | 
            -
             | 
| 423 | 
            +
                                       float * embd,
         | 
| 424 | 
            +
                                     int32_t   n_tokens,
         | 
| 425 | 
            +
                                         int   n_past),
         | 
| 426 | 
            +
                        "use llama_decode() instead");
         | 
| 333 427 |  | 
| 334 | 
            -
                //  | 
| 335 | 
            -
                // | 
| 336 | 
            -
                // | 
| 337 | 
            -
                // | 
| 338 | 
            -
                LLAMA_API  | 
| 428 | 
            +
                // Return batch for single sequence of tokens starting at pos_0
         | 
| 429 | 
            +
                //
         | 
| 430 | 
            +
                // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
         | 
| 431 | 
            +
                //
         | 
| 432 | 
            +
                LLAMA_API struct llama_batch llama_batch_get_one(
         | 
| 433 | 
            +
                              llama_token * tokens,
         | 
| 434 | 
            +
                                  int32_t   n_tokens,
         | 
| 435 | 
            +
                                llama_pos   pos_0,
         | 
| 436 | 
            +
                             llama_seq_id   seq_id);
         | 
| 437 | 
            +
             | 
| 438 | 
            +
                // Allocates a batch of tokens on the heap
         | 
| 439 | 
            +
                // The batch has to be freed with llama_batch_free()
         | 
| 440 | 
            +
                // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
         | 
| 441 | 
            +
                // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
         | 
| 442 | 
            +
                // The rest of the llama_batch members are allocated with size n_tokens
         | 
| 443 | 
            +
                // All members are left uninitialized
         | 
| 444 | 
            +
                LLAMA_API struct llama_batch llama_batch_init(
         | 
| 445 | 
            +
                        int32_t n_tokens,
         | 
| 446 | 
            +
                        int32_t embd);
         | 
| 447 | 
            +
             | 
| 448 | 
            +
                // Frees a batch of tokens allocated with llama_batch_init()
         | 
| 449 | 
            +
                LLAMA_API void llama_batch_free(struct llama_batch batch);
         | 
| 450 | 
            +
             | 
| 451 | 
            +
                // A positive return value does not mean a fatal error, but rather a warning.
         | 
| 452 | 
            +
                //   0 - success
         | 
| 453 | 
            +
                //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
         | 
| 454 | 
            +
                // < 0 - error
         | 
| 455 | 
            +
                LLAMA_API int llama_decode(
         | 
| 456 | 
            +
                        struct llama_context * ctx,
         | 
| 457 | 
            +
                          struct llama_batch   batch);
         | 
| 458 | 
            +
             | 
| 459 | 
            +
                // Set the number of threads used for decoding
         | 
| 460 | 
            +
                // n_threads is the number of threads used for generation (single token)
         | 
| 461 | 
            +
                // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
         | 
| 462 | 
            +
                LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
         | 
| 339 463 |  | 
| 340 464 | 
             
                // Token logits obtained from the last call to llama_decode()
         | 
| 341 465 | 
             
                // The logits for the last token are stored in the last row
         | 
| 342 | 
            -
                //  | 
| 343 | 
            -
                // Rows: n_tokens
         | 
| 466 | 
            +
                // Logits for which llama_batch.logits[i] == 0 are undefined
         | 
| 467 | 
            +
                // Rows: n_tokens provided with llama_batch
         | 
| 344 468 | 
             
                // Cols: n_vocab
         | 
| 345 469 | 
             
                LLAMA_API float * llama_get_logits(struct llama_context * ctx);
         | 
| 346 470 |  | 
| 471 | 
            +
                // Logits for the ith token. Equivalent to:
         | 
| 472 | 
            +
                // llama_get_logits(ctx) + i*n_vocab
         | 
| 473 | 
            +
                LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
         | 
| 474 | 
            +
             | 
| 347 475 | 
             
                // Get the embeddings for the input
         | 
| 348 476 | 
             
                // shape: [n_embd] (1-dimensional)
         | 
| 349 477 | 
             
                LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
         | 
| @@ -372,14 +500,6 @@ extern "C" { | |
| 372 500 | 
             
                // Returns the number of tokens on success, no more than n_max_tokens
         | 
| 373 501 | 
             
                // Returns a negative number on failure - the number of tokens that would have been returned
         | 
| 374 502 | 
             
                LLAMA_API int llama_tokenize(
         | 
| 375 | 
            -
                        struct llama_context * ctx,
         | 
| 376 | 
            -
                                  const char * text,
         | 
| 377 | 
            -
                                         int   text_len,
         | 
| 378 | 
            -
                                 llama_token * tokens,
         | 
| 379 | 
            -
                                         int   n_max_tokens,
         | 
| 380 | 
            -
                                        bool   add_bos);
         | 
| 381 | 
            -
             | 
| 382 | 
            -
                LLAMA_API int llama_tokenize_with_model(
         | 
| 383 503 | 
             
                    const struct llama_model * model,
         | 
| 384 504 | 
             
                                  const char * text,
         | 
| 385 505 | 
             
                                         int   text_len,
         | 
| @@ -392,12 +512,6 @@ extern "C" { | |
| 392 512 | 
             
                // Does not write null terminator to the buffer.
         | 
| 393 513 | 
             
                // User code is responsible for removing the leading whitespace of the first non-BOS token when decoding multiple tokens.
         | 
| 394 514 | 
             
                LLAMA_API int llama_token_to_piece(
         | 
| 395 | 
            -
                        const struct llama_context * ctx,
         | 
| 396 | 
            -
                                       llama_token   token,
         | 
| 397 | 
            -
                                              char * buf,
         | 
| 398 | 
            -
                                              int    length);
         | 
| 399 | 
            -
             | 
| 400 | 
            -
                LLAMA_API int llama_token_to_piece_with_model(
         | 
| 401 515 | 
             
                          const struct llama_model * model,
         | 
| 402 516 | 
             
                                       llama_token   token,
         | 
| 403 517 | 
             
                                              char * buf,
         | 
| @@ -420,11 +534,25 @@ extern "C" { | |
| 420 534 | 
             
                // Sampling functions
         | 
| 421 535 | 
             
                //
         | 
| 422 536 |  | 
| 537 | 
            +
                // Sets the current rng seed.
         | 
| 538 | 
            +
                LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
         | 
| 539 | 
            +
             | 
| 423 540 | 
             
                /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
         | 
| 424 | 
            -
                LLAMA_API void llama_sample_repetition_penalty( | 
| 541 | 
            +
                LLAMA_API void llama_sample_repetition_penalty(
         | 
| 542 | 
            +
                        struct llama_context * ctx,
         | 
| 543 | 
            +
                      llama_token_data_array * candidates,
         | 
| 544 | 
            +
                           const llama_token * last_tokens,
         | 
| 545 | 
            +
                                      size_t   last_tokens_size,
         | 
| 546 | 
            +
                                      float    penalty);
         | 
| 425 547 |  | 
| 426 548 | 
             
                /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
         | 
| 427 | 
            -
                LLAMA_API void llama_sample_frequency_and_presence_penalties( | 
| 549 | 
            +
                LLAMA_API void llama_sample_frequency_and_presence_penalties(
         | 
| 550 | 
            +
                        struct llama_context * ctx,
         | 
| 551 | 
            +
                      llama_token_data_array * candidates,
         | 
| 552 | 
            +
                           const llama_token * last_tokens,
         | 
| 553 | 
            +
                                      size_t   last_tokens_size,
         | 
| 554 | 
            +
                                       float   alpha_frequency,
         | 
| 555 | 
            +
                                       float   alpha_presence);
         | 
| 428 556 |  | 
| 429 557 | 
             
                /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
         | 
| 430 558 | 
             
                /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
         | 
| @@ -437,23 +565,54 @@ extern "C" { | |
| 437 565 | 
             
                                         float   scale);
         | 
| 438 566 |  | 
| 439 567 | 
             
                /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
         | 
| 440 | 
            -
                LLAMA_API void llama_sample_softmax( | 
| 568 | 
            +
                LLAMA_API void llama_sample_softmax(
         | 
| 569 | 
            +
                        struct llama_context * ctx,
         | 
| 570 | 
            +
                      llama_token_data_array * candidates);
         | 
| 441 571 |  | 
| 442 572 | 
             
                /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
         | 
| 443 | 
            -
                LLAMA_API void llama_sample_top_k( | 
| 573 | 
            +
                LLAMA_API void llama_sample_top_k(
         | 
| 574 | 
            +
                        struct llama_context * ctx,
         | 
| 575 | 
            +
                      llama_token_data_array * candidates,
         | 
| 576 | 
            +
                                         int   k,
         | 
| 577 | 
            +
                                      size_t   min_keep);
         | 
| 444 578 |  | 
| 445 579 | 
             
                /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
         | 
| 446 | 
            -
                LLAMA_API void llama_sample_top_p( | 
| 580 | 
            +
                LLAMA_API void llama_sample_top_p(
         | 
| 581 | 
            +
                        struct llama_context * ctx,
         | 
| 582 | 
            +
                      llama_token_data_array * candidates,
         | 
| 583 | 
            +
                                       float   p,
         | 
| 584 | 
            +
                                      size_t   min_keep);
         | 
| 447 585 |  | 
| 448 586 | 
             
                /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
         | 
| 449 | 
            -
                LLAMA_API void llama_sample_tail_free( | 
| 587 | 
            +
                LLAMA_API void llama_sample_tail_free(
         | 
| 588 | 
            +
                        struct llama_context * ctx,
         | 
| 589 | 
            +
                      llama_token_data_array * candidates,
         | 
| 590 | 
            +
                                       float   z,
         | 
| 591 | 
            +
                                      size_t   min_keep);
         | 
| 450 592 |  | 
| 451 593 | 
             
                /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
         | 
| 452 | 
            -
                LLAMA_API void llama_sample_typical( | 
| 453 | 
            -
             | 
| 594 | 
            +
                LLAMA_API void llama_sample_typical(
         | 
| 595 | 
            +
                        struct llama_context * ctx,
         | 
| 596 | 
            +
                      llama_token_data_array * candidates,
         | 
| 597 | 
            +
                                       float   p,
         | 
| 598 | 
            +
                                      size_t   min_keep);
         | 
| 599 | 
            +
             | 
| 600 | 
            +
                LLAMA_API void llama_sample_temp(
         | 
| 601 | 
            +
                        struct llama_context * ctx,
         | 
| 602 | 
            +
                      llama_token_data_array * candidates,
         | 
| 603 | 
            +
                                       float   temp);
         | 
| 604 | 
            +
             | 
| 605 | 
            +
                LLAMA_API DEPRECATED(void llama_sample_temperature(
         | 
| 606 | 
            +
                            struct llama_context * ctx,
         | 
| 607 | 
            +
                          llama_token_data_array * candidates,
         | 
| 608 | 
            +
                                           float   temp),
         | 
| 609 | 
            +
                        "use llama_sample_temp instead");
         | 
| 454 610 |  | 
| 455 611 | 
             
                /// @details Apply constraints from grammar
         | 
| 456 | 
            -
                LLAMA_API void llama_sample_grammar( | 
| 612 | 
            +
                LLAMA_API void llama_sample_grammar(
         | 
| 613 | 
            +
                        struct llama_context * ctx,
         | 
| 614 | 
            +
                      llama_token_data_array * candidates,
         | 
| 615 | 
            +
                  const struct llama_grammar * grammar);
         | 
| 457 616 |  | 
| 458 617 | 
             
                /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
         | 
| 459 618 | 
             
                /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         | 
| @@ -461,23 +620,41 @@ extern "C" { | |
| 461 620 | 
             
                /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         | 
| 462 621 | 
             
                /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
         | 
| 463 622 | 
             
                /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
         | 
| 464 | 
            -
                LLAMA_API llama_token llama_sample_token_mirostat( | 
| 623 | 
            +
                LLAMA_API llama_token llama_sample_token_mirostat(
         | 
| 624 | 
            +
                        struct llama_context * ctx,
         | 
| 625 | 
            +
                      llama_token_data_array * candidates,
         | 
| 626 | 
            +
                                       float   tau,
         | 
| 627 | 
            +
                                       float   eta,
         | 
| 628 | 
            +
                                         int   m,
         | 
| 629 | 
            +
                                       float * mu);
         | 
| 465 630 |  | 
| 466 631 | 
             
                /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
         | 
| 467 632 | 
             
                /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         | 
| 468 633 | 
             
                /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         | 
| 469 634 | 
             
                /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         | 
| 470 635 | 
             
                /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
         | 
| 471 | 
            -
                LLAMA_API llama_token llama_sample_token_mirostat_v2( | 
| 636 | 
            +
                LLAMA_API llama_token llama_sample_token_mirostat_v2(
         | 
| 637 | 
            +
                        struct llama_context * ctx,
         | 
| 638 | 
            +
                      llama_token_data_array * candidates,
         | 
| 639 | 
            +
                                       float   tau,
         | 
| 640 | 
            +
                                       float   eta,
         | 
| 641 | 
            +
                                       float * mu);
         | 
| 472 642 |  | 
| 473 643 | 
             
                /// @details Selects the token with the highest probability.
         | 
| 474 | 
            -
                LLAMA_API llama_token llama_sample_token_greedy( | 
| 644 | 
            +
                LLAMA_API llama_token llama_sample_token_greedy(
         | 
| 645 | 
            +
                        struct llama_context * ctx,
         | 
| 646 | 
            +
                      llama_token_data_array * candidates);
         | 
| 475 647 |  | 
| 476 648 | 
             
                /// @details Randomly selects a token from the candidates based on their probabilities.
         | 
| 477 | 
            -
                LLAMA_API llama_token llama_sample_token( | 
| 649 | 
            +
                LLAMA_API llama_token llama_sample_token(
         | 
| 650 | 
            +
                        struct llama_context * ctx,
         | 
| 651 | 
            +
                      llama_token_data_array * candidates);
         | 
| 478 652 |  | 
| 479 653 | 
             
                /// @details Accepts the sampled token into the grammar
         | 
| 480 | 
            -
                LLAMA_API void llama_grammar_accept_token( | 
| 654 | 
            +
                LLAMA_API void llama_grammar_accept_token(
         | 
| 655 | 
            +
                        struct llama_context * ctx,
         | 
| 656 | 
            +
                        struct llama_grammar * grammar,
         | 
| 657 | 
            +
                                 llama_token   token);
         | 
| 481 658 |  | 
| 482 659 | 
             
                //
         | 
| 483 660 | 
             
                // Beam search
         | 
| @@ -485,9 +662,10 @@ extern "C" { | |
| 485 662 |  | 
| 486 663 | 
             
                struct llama_beam_view {
         | 
| 487 664 | 
             
                    const llama_token * tokens;
         | 
| 665 | 
            +
             | 
| 488 666 | 
             
                    size_t n_tokens;
         | 
| 489 | 
            -
                    float | 
| 490 | 
            -
                    bool | 
| 667 | 
            +
                    float  p;        // Cumulative beam probability (renormalized relative to all beams)
         | 
| 668 | 
            +
                    bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
         | 
| 491 669 | 
             
                };
         | 
| 492 670 |  | 
| 493 671 | 
             
                // Passed to beam_search_callback function.
         | 
| @@ -496,9 +674,10 @@ extern "C" { | |
| 496 674 | 
             
                // These pointers are valid only during the synchronous callback, so should not be saved.
         | 
| 497 675 | 
             
                struct llama_beams_state {
         | 
| 498 676 | 
             
                    struct llama_beam_view * beam_views;
         | 
| 677 | 
            +
             | 
| 499 678 | 
             
                    size_t n_beams;               // Number of elements in beam_views[].
         | 
| 500 679 | 
             
                    size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
         | 
| 501 | 
            -
                    bool | 
| 680 | 
            +
                    bool   last_call;             // True iff this is the last callback invocation.
         | 
| 502 681 | 
             
                };
         | 
| 503 682 |  | 
| 504 683 | 
             
                // Type of pointer to the beam_search_callback function.
         | 
| @@ -513,11 +692,17 @@ extern "C" { | |
| 513 692 | 
             
                /// @param n_beams Number of beams to use.
         | 
| 514 693 | 
             
                /// @param n_past Number of tokens already evaluated.
         | 
| 515 694 | 
             
                /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
         | 
| 516 | 
            -
                 | 
| 517 | 
            -
             | 
| 695 | 
            +
                LLAMA_API void llama_beam_search(
         | 
| 696 | 
            +
                               struct llama_context * ctx,
         | 
| 697 | 
            +
                    llama_beam_search_callback_fn_t   callback,
         | 
| 698 | 
            +
                                               void * callback_data,
         | 
| 699 | 
            +
                                             size_t   n_beams,
         | 
| 700 | 
            +
                                                int   n_past,
         | 
| 701 | 
            +
                                                int   n_predict);
         | 
| 518 702 |  | 
| 519 703 | 
             
                // Performance information
         | 
| 520 704 | 
             
                LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
         | 
| 705 | 
            +
             | 
| 521 706 | 
             
                LLAMA_API void llama_print_timings(struct llama_context * ctx);
         | 
| 522 707 | 
             
                LLAMA_API void llama_reset_timings(struct llama_context * ctx);
         | 
| 523 708 |  | 
| @@ -526,7 +711,7 @@ extern "C" { | |
| 526 711 |  | 
| 527 712 | 
             
                // Set callback for all future logging events.
         | 
| 528 713 | 
             
                // If this is not called, or NULL is supplied, everything is output on stderr.
         | 
| 529 | 
            -
                LLAMA_API void llama_log_set( | 
| 714 | 
            +
                LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
         | 
| 530 715 |  | 
| 531 716 | 
             
                LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
         | 
| 532 717 |  | 
    
        data/lib/llama_cpp/version.rb
    CHANGED
    
    | @@ -3,8 +3,8 @@ | |
| 3 3 | 
             
            # llama_cpp.rb provides Ruby bindings for the llama.cpp.
         | 
| 4 4 | 
             
            module LLaMACpp
         | 
| 5 5 | 
             
              # The version of llama_cpp.rb you install.
         | 
| 6 | 
            -
              VERSION = '0. | 
| 6 | 
            +
              VERSION = '0.6.0'
         | 
| 7 7 |  | 
| 8 8 | 
             
              # The version of llama.cpp bundled with llama_cpp.rb.
         | 
| 9 | 
            -
              LLAMA_CPP_VERSION = ' | 
| 9 | 
            +
              LLAMA_CPP_VERSION = 'b1292'
         | 
| 10 10 | 
             
            end
         |