llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
 
    
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -72,6 +61,52 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token

@@ -86,25 +121,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 

@@ -129,33 +149,18 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
     };
-
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;

@@ -208,27 +213,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-    LLAMA_API int64_t llama_time_us();
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,

@@ -240,17 +234,28 @@ extern "C" {
                     struct llama_model * model,
             struct llama_context_params   params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
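The hunk above removes the deprecated `llama_init_from_file` path and adds model-level accessors (`llama_model_n_vocab`, `llama_model_type`, and friends) that work on a bare `llama_model`. A minimal sketch of querying a model through them; the model path and buffer size are placeholders, and the interpretation of `llama_model_type`'s int return value as a character count is an assumption:

#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false); // no NUMA optimizations

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        return 1;
    }

    // "Get a string describing the model type" per the header comment.
    char desc[128];
    llama_model_type(model, desc, sizeof(desc));
    printf("model: %s\n", desc);
    printf("n_vocab=%d n_ctx=%d n_embd=%d\n",
           llama_model_n_vocab(model),
           llama_model_n_ctx(model),
           llama_model_n_embd(model));

    llama_free_model(model); // counterpart of llama_load_model_from_file
    llama_backend_free();
    return 0;
}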
@@ -272,9 +277,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                         const char * path_lora,
-                         const char * path_base_model,
-                                int   n_threads);
+                             const char * path_lora,
+                             const char * path_base_model,
+                                    int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

@@ -324,11 +329,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                      const char * text,

@@ -343,55 +377,24 @@ extern "C" {
                            int   n_max_tokens,
                           bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
-                         llama_token   token);
+                         llama_token   token,
+                                char * buf,
+                                int    length);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_with_model(
               const struct llama_model * model,
-                         llama_token   token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();   // next-line
+                         llama_token   token,
+                                char * buf,
+                                int    length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
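Note the breaking change above: `llama_token_to_str` now returns an int and writes into a caller-supplied buffer, and per the new header comment it does not write a null terminator. A minimal sketch of a wrapper that restores a C-string contract; the negative-return convention for an undersized buffer is an assumption, mirroring the `llama_tokenize` documentation above:

#include "llama.h"

// Hypothetical helper (not part of the gem): copy a token's text into `out`
// and NUL-terminate it, since llama_token_to_str() no longer does.
static int token_to_cstr(struct llama_context * ctx, llama_token token,
                         char * out, int out_size) {
    const int n = llama_token_to_str(ctx, token, out, out_size - 1);
    if (n < 0) {
        return n; // assumed: buffer too small or error; `out` is not usable
    }
    out[n] = '\0';
    return n;
}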
@@ -399,7 +402,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);

@@ -468,6 +473,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif

@@ -477,10 +486,11 @@
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
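The `llama_log_set` declaration moves here from earlier in the header (old lines 211-213) and pairs with the `enum llama_log_level` and `llama_log_callback` definitions added above. A minimal sketch of installing a filtering callback; the severity policy is illustrative:

#include <stdio.h>
#include "llama.h"

// Forward only warnings and errors. Per the header's comment, `text`
// usually already ends with '\n', so no extra newline is added here.
static void quiet_logger(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level <= LLAMA_LOG_LEVEL_WARN) { // ERROR = 2, WARN = 3, INFO = 4
        fputs(text, stderr);
    }
}

int main(void) {
    llama_log_set(quiet_logger, NULL);
    llama_backend_init(false);
    // ... load a model, create a context, evaluate ...
    llama_backend_free();
    return 0;
}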
    
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.7'
+  VERSION = '0.4.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '…'
+  LLAMA_CPP_VERSION = 'b1060'
 end
data/lib/llama_cpp.rb
CHANGED

@@ -101,7 +101,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.token_to_str(token) }
 
-      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == context.token_eos
     end
 
     output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED

@@ -1,9 +1,6 @@
 module LLaMACpp
   VERSION: String
   LLAMA_CPP_VERSION: String
-  LLAMA_FILE_VERSION: String
-  LLAMA_FILE_MAGIC: String
-  LLAMA_FILE_MAGIC_UNVERSIONED: String
   LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer

@@ -42,9 +39,7 @@ module LLaMACpp
     ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
     ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
-  def self?.token_bos: () -> Integer
-  def self?.token_eos: () -> Integer
-  def self?.token_nl: () -> Integer
+  def self?.time_us: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer

@@ -81,9 +76,9 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def type: () -> String
   end
 
   class Timings

@@ -105,6 +100,12 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool

@@ -112,7 +113,6 @@ module LLaMACpp
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: llama_cpp
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.4.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - yoshoku
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2023-08- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2023-08-26 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       13 
13 
     | 
    
         
             
            description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
         
     | 
| 
       14 
14 
     | 
    
         
             
            email:
         
     |