RubyGems - llama_cpp - Versions diffs - 0.14.6 → 0.15.0 - Mend

llama_cpp 0.14.6 → 0.15.0

Files changed (29) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +22 -0
data/README.md +2 -2
data/ext/llama_cpp/extconf.rb +2 -1
data/ext/llama_cpp/llama_cpp.cpp +90 -11
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +22 -3
data/vendor/tmp/llama.cpp/Makefile +52 -22
data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
data/vendor/tmp/llama.cpp/ggml-backend.c +21 -15
data/vendor/tmp/llama.cpp/ggml-cuda.cu +6 -0
data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -0
data/vendor/tmp/llama.cpp/ggml-metal.m +376 -176
data/vendor/tmp/llama.cpp/ggml-metal.metal +654 -18
data/vendor/tmp/llama.cpp/ggml-quants.c +284 -293
data/vendor/tmp/llama.cpp/ggml-sycl.cpp +17 -7
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5 -0
data/vendor/tmp/llama.cpp/ggml.c +394 -44
data/vendor/tmp/llama.cpp/ggml.h +22 -0
data/vendor/tmp/llama.cpp/llama.cpp +996 -455
data/vendor/tmp/llama.cpp/llama.h +46 -15
data/vendor/tmp/llama.cpp/sgemm.cpp +437 -590
data/vendor/tmp/llama.cpp/sgemm.h +4 -2
data/vendor/tmp/llama.cpp/unicode-data.cpp +1 -1
data/vendor/tmp/llama.cpp/unicode-data.h +2 -2
data/vendor/tmp/llama.cpp/unicode.cpp +448 -39
data/vendor/tmp/llama.cpp/unicode.h +2 -1
metadata +3 -3

data/vendor/tmp/llama.cpp/llama.h CHANGED Viewed

@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 5
+#define LLAMA_SESSION_VERSION 6
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@@ -69,6 +69,18 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };
+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    };
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
@@ -195,15 +207,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };
     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;
+        char key[128];
         union {
-            int64_t int_value;
-            double float_value;
-            bool bool_value;
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };
@@ -232,9 +248,10 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap;   // use mmap if possible
-        bool use_mlock;  // force system to keep model in RAM
+        bool vocab_only;    // only load the vocabulary, no weights
+        bool use_mmap;      // use mmap if possible
+        bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };
     struct llama_context_params {
@@ -270,6 +287,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -288,6 +306,7 @@ extern "C" {
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
+        bool keep_split;                     // quantize to the same number of shards
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
     } llama_model_quantize_params;
@@ -390,8 +409,10 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model   * model);
+    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model   * model);
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -522,7 +543,7 @@ extern "C" {
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
-    // Clear the KV cache
+    // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
             struct llama_context * ctx);
@@ -783,6 +804,9 @@ extern "C" {
     LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
@@ -796,7 +820,7 @@ extern "C" {
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
-    // codellama infill tokens
+    // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
     LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@@ -825,11 +849,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
               const struct llama_model * model,
                            llama_token   token,
                                   char * buf,
-                               int32_t   length);
+                               int32_t   length,
+                                  bool   special);
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
@@ -982,7 +1008,7 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);
-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
@@ -1069,8 +1095,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
-#include <vector>
+#include <random>
 #include <string>
+#include <vector>
 struct ggml_tensor;
@@ -1107,6 +1134,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8   partial_start);
+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
 #endif // LLAMA_API_INTERNAL
 #endif // LLAMA_H