RubyGems - llama_cpp - Versions diffs - 0.9.5 → 0.10.0 - Mend

llama_cpp 0.9.5 → 0.10.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/ext/llama_cpp/llama_cpp.cpp +121 -15
data/ext/llama_cpp/src/ggml-alloc.c +42 -7
data/ext/llama_cpp/src/ggml-alloc.h +7 -0
data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
data/ext/llama_cpp/src/ggml-backend.c +563 -156
data/ext/llama_cpp/src/ggml-backend.h +62 -17
data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
data/ext/llama_cpp/src/ggml-cuda.h +9 -1
data/ext/llama_cpp/src/ggml-impl.h +1 -1
data/ext/llama_cpp/src/ggml-metal.h +6 -0
data/ext/llama_cpp/src/ggml-metal.m +506 -158
data/ext/llama_cpp/src/ggml-metal.metal +795 -144
data/ext/llama_cpp/src/ggml.c +331 -111
data/ext/llama_cpp/src/ggml.h +49 -4
data/ext/llama_cpp/src/llama.cpp +749 -329
data/ext/llama_cpp/src/llama.h +28 -5
data/lib/llama_cpp/version.rb +2 -2
data/sig/llama_cpp.rbs +20 -2
metadata +2 -2

data/ext/llama_cpp/src/llama.h CHANGED Viewed

@@ -42,7 +42,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -158,6 +158,22 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
@@ -191,11 +211,14 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
     };
     // model quantization parameters

data/lib/llama_cpp/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.5'
+  VERSION = '0.10.0'
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1593'
+  LLAMA_CPP_VERSION = 'b1620'
 end

data/sig/llama_cpp.rbs CHANGED Viewed

@@ -23,6 +23,10 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+  LLAMA_KV_OVERRIDE_INT: Integer
+  LLAMA_KV_OVERRIDE_FLOAT: Integer
+  LLAMA_KV_OVERRIDE_BOOL: Integer
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
   LLAMA_GRETYPE_RULE_REF: Integer
@@ -116,6 +120,16 @@ module LLaMACpp
     def n_eval: () -> Integer
   end
+  class ModelKVOverride
+    public
+    def key: () -> String
+    def tag: () -> Integer
+    def int_value: () -> Integer
+    def float_value: () -> Float
+    def bool_value: () -> bool
+  end
   class ModelParams
     public
@@ -225,14 +239,18 @@ module LLaMACpp
     def yarn_beta_slow: () -> Float
     def yarn_orig_ctx=: (Integer) -> Integer
     def yarn_orig_ctx: () -> Integer
+    def type_k=: (Integer) -> Integer
+    def type_k: () -> Integer
+    def type_v=: (Integer) -> Integer
+    def type_v: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
-    def f16_kv: () -> bool
-    def f16_kv=: (bool) -> bool
     def logits_all: () -> bool
     def logits_all=: (bool) -> bool
     def embedding: () -> bool
     def embedding=: (bool) -> bool
+    def offload_kqv: () -> bool
+    def offload_kqv=: (bool) -> bool
   end
   class ModelQuantizeParams

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.5
+  version: 0.10.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-02 00:00:00.000000000 Z
+date: 2023-12-09 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: