llama_cpp 0.13.0 → 0.14.1

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
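
The vocab-type constants are renumbered here (0 now means LLAMA_VOCAB_TYPE_NONE), so callers should compare against the enum names rather than hard-coded integers. A minimal sketch, assuming `model` is an already loaded `llama_model *`:

    if (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM) {
        // SentencePiece-specific handling; SPM is now 1, not 0
    }
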
@@ -129,6 +130,7 @@ extern "C" {
     };
 
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
@@ -162,7 +164,7 @@ extern "C" {
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
     // - seq_id : the sequence to which the respective token belongs
-    // - logits : if zero, the logits for the respective token will not be output
+    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -172,7 +174,7 @@ extern "C" {
         llama_pos * pos;
         int32_t * n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t * logits;
+        int8_t * logits; // TODO: rename this to "output"
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
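
Since the `logits` flag now also gates embedding output, a common pattern is to request output only for the last prompt token. A minimal sketch, assuming `prompt_tokens` and `n_prompt` are supplied by the caller:

    llama_batch batch = llama_batch_init(n_prompt, 0, 1);
    for (int i = 0; i < n_prompt; i++) {
        batch.token[i]     = prompt_tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == n_prompt - 1); // output only for the last token
    }
    batch.n_tokens = n_prompt;
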
@@ -233,10 +235,15 @@ extern "C" {
     struct llama_context_params {
         uint32_t seed;  // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
-        uint32_t n_batch; // prompt processing maximum batch size
+        uint32_t n_batch;   // logical maximum batch size that can be submitted to llama_decode
+        uint32_t n_ubatch;  // physical maximum batch size
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-        int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type pooling_type;           // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
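
A sketch of filling the new fields, starting from `llama_context_default_params()`; the specific values are illustrative only:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx        = 4096;
    cparams.n_batch      = 2048;                    // logical batch passed to llama_decode
    cparams.n_ubatch     = 512;                     // physical batch processed at once
    cparams.n_seq_max    = 4;                       // parallel sequences / recurrent states
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // pooled sequence embeddings
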
@@ -255,10 +262,15 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;  // embedding mode only
+        bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embeddings; // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void * abort_callback_data;
     };
 
     // model quantization parameters
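
The abort callback follows the `ggml_abort_callback` convention: a function taking a `void *` user-data pointer and returning true to abort. A sketch of a cooperative-cancellation hook; `cancel_requested` and `cparams` are hypothetical names owned by the caller:

    static bool my_should_abort(void * data) {
        const volatile bool * cancel = (const volatile bool *) data;
        return *cancel; // true aborts the current llama_decode() call
    }

    // when filling llama_context_params:
    cparams.abort_callback      = my_should_abort;
    cparams.abort_callback_data = (void *) &cancel_requested;
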
@@ -268,7 +280,7 @@ extern "C" {
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure; // disable k-quant mixtures and quantize all tensors to the same type
+        bool pure; // quantize all tensors to the default type
         void * imatrix; // pointer to importance matrix data
     } llama_model_quantize_params;
 
@@ -367,6 +379,8 @@ extern "C" {
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
@@ -445,7 +459,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;
 
         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -465,12 +479,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;
 
-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };
 
     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 
     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
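
Usage is unchanged apart from the renamed parameter; a debugging-only sketch, assuming `ctx` is a live context:

    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max =*/ 4);
    llama_kv_cache_view_update(ctx, &view); // refresh the snapshot
    // inspect view.cells and view.cells_sequences (n_seq_max entries per cell)
    llama_kv_cache_view_free(&view);
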
@@ -493,7 +507,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
             llama_pos p0,
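
Because removal can now report failure, callers should check the return value instead of assuming success. A sketch, where `n_keep` is a hypothetical number of leading tokens to preserve in sequence 0:

    if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
        // removal failed; fall back to clearing the whole cache
        llama_kv_cache_clear(ctx);
    }
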
@@ -632,7 +646,19 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    // Token logits obtained from the last call to llama_eval()
+    // Set whether to use causal attention or not
+    // If set to true, the model will only attend to the past tokens
+    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Wait until all computations are finished
+    // This is automatically done when using one of the functions below to obtain the computation results
+    // and is not necessary to call it explicitly in most cases
+    LLAMA_API void llama_synchronize(struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
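
A sketch combining the new calls for an embedding-style pass over a prepared `batch`; disabling causal attention and the explicit synchronization are both optional depending on model and backend:

    llama_set_causal_attn(ctx, false); // bidirectional attention, e.g. for encoder-style embeddings
    if (llama_decode(ctx, batch) != 0) {
        // non-zero return: decoding did not complete normally
    }
    llama_synchronize(ctx);            // optional: the llama_get_* accessors synchronize implicitly
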
@@ -643,14 +669,20 @@ extern "C" {
     //   llama_get_logits(ctx) + i*n_vocab
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
+    // Get all output token embeddings
+    // shape: [n_tokens*n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith sequence
+    // Get the embeddings for the ith token
     //   llama_get_embeddings(ctx) + i*n_embd
+    // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
+    // Get the embeddings for a sequence id
+    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
+
     //
     // Vocab
     //
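
A sketch of reading a pooled sequence embedding after a decode with `embeddings` enabled and a pooling type other than NONE:

    const int32_t n_embd = llama_n_embd(model);
    const float * emb    = llama_get_embeddings_seq(ctx, 0); // sequence id 0
    if (emb != NULL) {
        // emb points at n_embd floats holding the pooled embedding for the sequence
    }
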
@@ -684,7 +716,7 @@ extern "C" {
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
@@ -693,7 +725,7 @@ extern "C" {
             const char * text,
             int32_t text_len,
             llama_token * tokens,
-            int32_t n_max_tokens,
+            int32_t n_tokens_max,
             bool add_bos,
             bool special);
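
The renamed parameter keeps the existing contract: a negative return value is the negated number of tokens that would have been written. A sketch of the usual two-pass pattern, with `model`, `text`, `text_len`, `tokens`, and `n_tokens_max` supplied by the caller:

    int32_t n = llama_tokenize(model, text, text_len, tokens, n_tokens_max,
                               /*add_bos =*/ true, /*special =*/ false);
    if (n < 0) {
        // buffer too small: grow `tokens` to hold -n entries and tokenize again
        n_tokens_max = -n;
    }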