llama_cpp 0.14.3 → 0.14.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,9 +37,13 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 5
+
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,9 +64,9 @@ extern "C" {
 
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -117,6 +121,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_S  = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -275,13 +280,16 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;      // quantize to this llama_ftype
-        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor; // quantize output.weight
-        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                   // quantize all tensors to the default type
-        void * imatrix;              // pointer to importance matrix data
+        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;              // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;   // output tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
+        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;         // quantize output.weight
+        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                           // quantize all tensors to the default type
+        void * imatrix;                      // pointer to importance matrix data
+        void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
 
     // grammar types
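A minimal sketch of how the new per-tensor overrides might be used, assuming the GGUF paths are placeholders; llama_model_quantize_default_params() and llama_model_quantize() are existing entry points in this header:

```cpp
#include "llama.h"

int main() {
    // start from the library defaults, then opt into the new overrides
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread              = 8;
    params.ftype                = LLAMA_FTYPE_MOSTLY_IQ4_XS;
    params.output_tensor_type   = GGML_TYPE_Q8_0; // keep output.weight at higher precision
    params.token_embedding_type = GGML_TYPE_Q8_0; // keep token embeddings at higher precision

    // paths are placeholders for illustration only
    return llama_model_quantize("model-f16.gguf", "model-iq4_xs.gguf", &params) == 0 ? 0 : 1;
}
```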
@@ -519,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
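A hedged sketch of handling the return value this comment documents; ctx and n_keep are assumed to exist, and the bool return is taken from the comment above:

```cpp
// drop everything sequence 0 produced from position n_keep onwards
if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
    // removing a partial range can fail; removing the whole sequence cannot
    llama_kv_cache_seq_rm(ctx, 0, -1, -1);
}
```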
@@ -590,35 +599,93 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
             uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+            uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
            const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+            const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
            struct llama_context * ctx,
            const char * path_session,
            llama_token * tokens_out,
            size_t n_token_capacity,
            size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
            struct llama_context * ctx,
            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+            llama_seq_id seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            llama_seq_id seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            llama_seq_id dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+            const char * filepath,
+            llama_seq_id seq_id,
            const llama_token * tokens,
            size_t n_token_count);
 
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+            const char * filepath,
+            llama_seq_id dest_seq_id,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
     //
     // Decoding
     //
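A minimal sketch of the new single-sequence state API added above, assuming ctx is an initialized llama_context; the helper name copy_sequence_state is made up for illustration:

```cpp
#include <vector>
#include "llama.h"

// snapshot the KV cache of seq_src and replay it into seq_dst
static bool copy_sequence_state(llama_context * ctx, llama_seq_id seq_src, llama_seq_id seq_dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, seq_src));
    if (llama_state_seq_get_data(ctx, buf.data(), seq_src) == 0) {
        return false;
    }
    // per the header comment, zero means the sequence data failed to load
    return llama_state_seq_set_data(ctx, buf.data(), seq_dst) != 0;
}
```

The renamed whole-context functions (llama_state_get_size, llama_state_get_data, llama_state_set_data) follow the same pattern; the old names still compile but now emit the deprecation messages shown above.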
@@ -674,23 +741,31 @@ extern "C" {
     LLAMA_API void llama_synchronize(struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_decode()
-    // The logits for the last token are stored in the last row
-    // Logits for which llama_batch.logits[i] == 0 are undefined
-    // Rows: n_tokens provided with llama_batch
+    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
-    // llama_get_logits(ctx) + i*n_vocab
+    // Logits for the ith token. For positive indices, equivalent to:
+    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
+    // Returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
-    // Get all output token embeddings
-    // shape: [n_tokens*n_embd] (1-dimensional)
+    // Get all output token embeddings.
+    // When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+    // in the order they have appeared in the batch.
+    // shape: [n_outputs*n_embd]
+    // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token
-    // llama_get_embeddings(ctx) + i*n_embd
+    // Get the embeddings for the ith token. For positive indices, equivalent to:
+    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
+    // Returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 
     // Get the embeddings for a sequence id
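A short sketch of the new indexing semantics, assuming ctx already holds the result of a llama_decode() call:

```cpp
// only tokens flagged with batch.logits[i] != 0 produce an output row;
// -1 addresses the most recent of those rows
float * last_logits = llama_get_logits_ith(ctx, -1);
if (last_logits == nullptr) {
    // invalid index, e.g. no token in the batch requested logits
}
```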
@@ -711,6 +786,8 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
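A sketch of how the new accessors might frame a BERT-style input; model and text_tokens (a std::vector&lt;llama_token&gt;) are assumptions for illustration:

```cpp
std::vector<llama_token> input;
input.push_back(llama_token_cls(model));                           // [CLS]
input.insert(input.end(), text_tokens.begin(), text_tokens.end()); // the tokenized text
input.push_back(llama_token_sep(model));                           // [SEP]
```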
@@ -733,16 +810,16 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                            int32_t text_len,
                      llama_token * tokens,
                            int32_t n_tokens_max,
-                              bool add_bos,
-                              bool special);
+                              bool add_special,
+                              bool parse_special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
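A sketch using the renamed parameters; model and prompt are assumed to exist, and the initial buffer size of 512 is arbitrary:

```cpp
std::vector<llama_token> tokens(512);
int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           /*add_special*/   true,   // let the model insert its special tokens (e.g. BOS)
                           /*parse_special*/ false); // treat special tokens in the text as plaintext
if (n < 0) {
    tokens.resize(-n); // a negative return value is the required capacity
    n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                       tokens.data(), (int32_t) tokens.size(), true, false);
}
tokens.resize(n);
```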
@@ -960,6 +1037,16 @@ extern "C" {
             int32_t n_past,
             int32_t n_predict);
 
+    /// @details Build a split GGUF final path for this chunk.
+    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+    //  Returns the split_path length.
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+
+    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+    //  Returns the split_prefix length.
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
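The doc comments above already state the expected results; spelled out as a quick sketch, with the buffer sizes chosen arbitrarily:

```cpp
char split_path[256];
llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
// split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

char split_prefix[256];
if (llama_split_prefix(split_prefix, sizeof(split_prefix),
                       "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) > 0) {
    // split_prefix == "/models/ggml-model-q4_0"
}
```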
@@ -987,10 +1074,39 @@ extern "C" {
 
 struct ggml_tensor;
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx
 );
 
+void llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t chr,
+        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H