llama_cpp 0.14.3 → 0.14.5
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as published in their respective public registries.
 - checksums.yaml +4 -4
 - data/CHANGELOG.md +16 -0
 - data/examples/chat.rb +2 -4
 - data/ext/llama_cpp/extconf.rb +1 -0
 - data/ext/llama_cpp/llama_cpp.cpp +27 -0
 - data/lib/llama_cpp/version.rb +2 -2
 - data/sig/llama_cpp.rbs +14 -0
 - data/vendor/tmp/llama.cpp/LICENSE +1 -1
 - data/vendor/tmp/llama.cpp/Makefile +81 -20
 - data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -2
 - data/vendor/tmp/llama.cpp/ggml-backend.c +1 -1
 - data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
 - data/vendor/tmp/llama.cpp/ggml-common.h +25 -2
 - data/vendor/tmp/llama.cpp/ggml-cuda.cu +295 -9324
 - data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
 - data/vendor/tmp/llama.cpp/ggml-metal.m +133 -113
 - data/vendor/tmp/llama.cpp/ggml-metal.metal +344 -276
 - data/vendor/tmp/llama.cpp/ggml-opencl.cpp +5 -0
 - data/vendor/tmp/llama.cpp/ggml-quants.c +785 -190
 - data/vendor/tmp/llama.cpp/ggml-quants.h +83 -80
 - data/vendor/tmp/llama.cpp/ggml-sycl.cpp +963 -588
 - data/vendor/tmp/llama.cpp/ggml-sycl.h +13 -3
 - data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +37199 -14939
 - data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +329 -308
 - data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -11
 - data/vendor/tmp/llama.cpp/ggml.c +141 -101
 - data/vendor/tmp/llama.cpp/ggml.h +18 -12
 - data/vendor/tmp/llama.cpp/llama.cpp +2519 -625
 - data/vendor/tmp/llama.cpp/llama.h +145 -29
 - data/vendor/tmp/llama.cpp/unicode-data.cpp +1651 -0
 - data/vendor/tmp/llama.cpp/unicode-data.h +16 -0
 - data/vendor/tmp/llama.cpp/unicode.cpp +8 -1403
 - data/vendor/tmp/llama.cpp/unicode.h +2 -0
 - metadata +5 -3
 
--- a/data/vendor/tmp/llama.cpp/llama.h
+++ b/data/vendor/tmp/llama.cpp/llama.h
@@ -37,9 +37,13 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 5
+
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +64,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, //
-        LLAMA_VOCAB_TYPE_BPE  = 2, //
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -117,6 +121,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +280,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;
-        enum llama_ftype ftype;
-
-
-        bool
-        bool
-
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
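
The hunk above fills in the quantization parameter block: output_tensor_type and token_embedding_type are new in this release and let a caller pin those two tensors to a specific ggml type. For illustration only (not part of the package diff), a minimal C sketch against this header; llama_model_quantize_default_params() and llama_model_quantize() are declared elsewhere in llama.h, and the file names are placeholders:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        // start from the library defaults, then override the new fields
        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_IQ4_XS;
        params.nthread              = 0;              // <= 0: use hardware_concurrency()
        params.output_tensor_type   = GGML_TYPE_Q6_K; // keep output.weight at higher precision
        params.token_embedding_type = GGML_TYPE_Q6_K; // same for the token embeddings

        uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params);
        if (rc != 0) {
            fprintf(stderr, "quantization failed (%u)\n", rc);
            return 1;
        }
        return 0;
    }
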
@@ -519,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
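
The new comment documents a behavior change: partial removals can now fail, so llama_kv_cache_seq_rm reports a boolean result in this release. A hedged sketch of honoring that result (the signature is assumed from the surrounding header, and the fallback policy is illustrative):

    #include <stdbool.h>
    #include "llama.h"

    // Drop positions [keep, inf) of sequence 0; if the backend cannot remove a
    // partial range, clear the whole sequence instead (that never fails).
    static bool trim_tail(struct llama_context * ctx, llama_pos keep) {
        if (!llama_kv_cache_seq_rm(ctx, 0, keep, -1)) {  // p1 < 0 : [p0, inf)
            return llama_kv_cache_seq_rm(ctx, 0, -1, -1);
        }
        return true;
    }
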
@@ -590,35 +599,93 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
             struct llama_context * ctx,
                      const char * path_session,
                     llama_token * tokens_out,
                          size_t   n_token_capacity,
                          size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                     const char * path_session,
+                    llama_token * tokens_out,
+                         size_t   n_token_capacity,
+                         size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
+            struct llama_context * ctx,
+                     const char * path_session,
+              const llama_token * tokens,
+                         size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
             struct llama_context * ctx,
                      const char * path_session,
+              const llama_token * tokens,
+                         size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
              const llama_token * tokens,
                          size_t   n_token_count);
 
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
+
     //
     // Decoding
     //
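
In short: the old llama_get_state_size / llama_copy_state_data / llama_set_state_data / *_session_file entry points survive only as DEPRECATED aliases of the new llama_state_* names, and a per-sequence family (llama_state_seq_*) is added for copying a single sequence's KV cache. A sketch of both round trips, assuming a valid ctx created elsewhere:

    #include <stdlib.h>
    #include "llama.h"

    // Snapshot and restore the whole context state with the new names.
    static uint8_t * snapshot(struct llama_context * ctx, size_t * n_used) {
        uint8_t * buf = malloc(llama_state_get_size(ctx));
        if (buf) *n_used = llama_state_get_data(ctx, buf); // bytes actually written
        return buf;
    }

    static void restore(struct llama_context * ctx, const uint8_t * buf) {
        llama_state_set_data(ctx, buf); // returns bytes read
    }

    // Duplicate sequence 0 into sequence 1 via the new per-sequence API.
    static int copy_seq0_to_seq1(struct llama_context * ctx) {
        uint8_t * buf = malloc(llama_state_seq_get_size(ctx, 0));
        if (!buf) return 0;
        llama_state_seq_get_data(ctx, buf, 0);
        size_t ok = llama_state_seq_set_data(ctx, buf, 1); // zero means failure
        free(buf);
        return ok != 0;
    }
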
@@ -674,23 +741,31 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for
-    //
-    // Rows:
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // Logits for the ith token. For positive indices, equivalent to:
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    //
+    // Get all output token embeddings.
+    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. For positive indices, equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
+    // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
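
The ith accessors now accept negative indices and return NULL for invalid ids instead of leaving the caller to stay in range. A sketch of reading the last token's logits; llama_n_vocab() is assumed from elsewhere in this header:

    #include "llama.h"

    // After llama_decode(), pick the arg-max token from the last output row.
    static int32_t argmax_last(struct llama_context * ctx, const struct llama_model * model) {
        float * logits = llama_get_logits_ith(ctx, -1); // -1 = last logit row
        if (logits == NULL) return -1;                  // invalid id
        int32_t best = 0, n_vocab = llama_n_vocab(model);
        for (int32_t t = 1; t < n_vocab; ++t) {
            if (logits[t] > logits[best]) best = t;
        }
        return best;
    }
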
@@ -711,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
@@ -733,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param
-    ///
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                          int32_t   text_len,
                      llama_token * tokens,
                          int32_t   n_tokens_max,
-                            bool
-                            bool
+                            bool   add_special,
+                            bool   parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
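
The last two parameters are now add_special (automatic special-token insertion) and parse_special (documented in the hunk above). A sketch of the common call shape; the flag values are illustrative defaults, not mandated by the header:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>
    #include "llama.h"

    static int32_t tokenize_prompt(const struct llama_model * model, const char * text,
                                   llama_token * tokens, int32_t n_tokens_max) {
        int32_t n = llama_tokenize(model, text, (int32_t) strlen(text),
                                   tokens, n_tokens_max,
                                   /*add_special=*/   true,   // insert special tokens per the model's configuration
                                   /*parse_special=*/ false); // treat control tokens as plaintext
        if (n < 0) {
            fprintf(stderr, "buffer too small: %d tokens needed\n", -n); // negative = required count
        }
        return n;
    }
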
@@ -960,6 +1037,16 @@ extern "C" {
                            int32_t   n_past,
                            int32_t   n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
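
These helpers make the -00002-of-00004.gguf shard naming programmatic in both directions. A sketch built directly from the examples in the doc comments above:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        char path[256], prefix[256];

        // prefix + (shard 2 of 4) -> "/models/ggml-model-q4_0-00002-of-00004.gguf"
        llama_split_path(path, sizeof(path), "/models/ggml-model-q4_0", 2, 4);
        printf("%s\n", path);

        // inverse: recover the prefix only if split_no/split_count match
        if (llama_split_prefix(prefix, sizeof(prefix), path, 2, 4) > 0) {
            printf("%s\n", prefix); // "/models/ggml-model-q4_0"
        }
        return 0;
    }
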
@@ -987,10 +1074,39 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8                                      partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+void llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H