llama_cpp 0.12.0 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -103,6 +103,9 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
@@ -115,6 +118,12 @@ extern "C" {
     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
 };
 
+enum llama_split_mode {
+    LLAMA_SPLIT_NONE = 0,  // single GPU
+    LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+    LLAMA_SPLIT_ROW = 2,   // split rows across GPUs
+};
+
 typedef struct llama_token_data {
     llama_token id; // token id
     float logit;    // log-odds of the token
@@ -177,8 +186,16 @@ extern "C" {
 
 struct llama_model_params {
     int32_t n_gpu_layers; // number of layers to store in VRAM
-    int32_t main_gpu; // the GPU that is used for scratch and small tensors
-    const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+    enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+    // main_gpu interpretation depends on split_mode:
+    // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+    // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+    // LLAMA_SPLIT_LAYER: ignored
+    int32_t main_gpu;
+
+    // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+    const float * tensor_split;
 
     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
     // If the provided progress_callback returns true, model loading continues.
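
The llama_model_params change replaces the single-purpose main_gpu/tensor_split pair with an explicit split_mode; main_gpu is now only meaningful for LLAMA_SPLIT_NONE and LLAMA_SPLIT_ROW. Below is a minimal sketch of how a caller might fill the new struct, assuming llama_model_default_params(), llama_load_model_from_file(), and llama_free_model() from the unchanged parts of llama.h; the layer count and the 60/40 split are purely illustrative.

#include "llama.h"

// Sketch only: configure the 0.12.2-era model parameters for layer splitting.
int load_split_across_gpus(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;                // offload as many layers as fit
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // spread layers and KV across GPUs
    mparams.main_gpu     = 0;                 // ignored for LLAMA_SPLIT_LAYER

    // tensor_split must point at LLAMA_MAX_DEVICES floats; a two-device
    // build is assumed here and the proportions are hypothetical.
    static const float split[] = { 0.6f, 0.4f };
    mparams.tensor_split = split;

    struct llama_model * model = llama_load_model_from_file(path, mparams);
    if (model == NULL) {
        return 1; // loading failed
    }
    llama_free_model(model);
    return 0;
}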
@@ -232,6 +249,7 @@ extern "C" {
     bool quantize_output_tensor; // quantize output.weight
     bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     bool pure; // disable k-quant mixtures and quantize all tensors to the same type
+    void * imatrix; // pointer to importance matrix data
 } llama_model_quantize_params;
 
 // grammar types
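
The quantization side picks up both the new 2-bit file types (LLAMA_FTYPE_MOSTLY_IQ2_XXS / IQ2_XS / Q2_K_S above) and an imatrix pointer on llama_model_quantize_params. A hedged sketch, assuming llama_model_quantize_default_params() and llama_model_quantize() from elsewhere in llama.h; how the importance-matrix blob is produced or loaded is outside this diff, so it is passed in opaquely.

#include "llama.h"

// Sketch only: quantize a GGUF file to one of the new IQ2 types.
int quantize_iq2(const char * fname_in, const char * fname_out, void * imatrix_data) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();

    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XS; // new in this release
    qparams.imatrix = imatrix_data;              // importance matrix data, may be NULL

    // Returns 0 on success.
    return (int) llama_model_quantize(fname_in, fname_out, &qparams);
}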
@@ -484,6 +502,17 @@ extern "C" {
     llama_pos p1,
     llama_pos delta);
 
+// Integer division of the positions by factor of `d > 1`
+// If the KV cache is RoPEd, the KV data is updated accordingly
+// p0 < 0 : [0, p1]
+// p1 < 0 : [p0, inf)
+LLAMA_API void llama_kv_cache_seq_div(
+    struct llama_context * ctx,
+    llama_seq_id seq_id,
+    llama_pos p0,
+    llama_pos p1,
+    int d);
+
 //
 // State / sessions
 //
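
llama_kv_cache_seq_div complements the existing position-shift call whose parameters appear at the top of this hunk: instead of adding a delta, it divides the cached positions of a sequence by d. A small usage sketch; the sequence id and divisor are arbitrary examples.

#include "llama.h"

// Sketch only: compress positions p0..p1 of sequence 0 by a factor of 4,
// e.g. as one step of a "self-extend"-style context-compression scheme.
void compress_positions(struct llama_context * ctx, llama_pos p0, llama_pos p1) {
    llama_kv_cache_seq_div(ctx, /*seq_id=*/0, p0, p1, /*d=*/4); // d must be > 1
}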
@@ -685,14 +714,21 @@ extern "C" {
     float penalty_present);
 
 /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-LLAMA_API void llama_sample_classifier_free_guidance(
+/// @param logits Logits extracted from the original generation context.
+/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+LLAMA_API void llama_sample_apply_guidance(
+    struct llama_context * ctx,
+    float * logits,
+    float * logits_guidance,
+    float scale);
+
+LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
     struct llama_context * ctx,
     llama_token_data_array * candidates,
     struct llama_context * guidance_ctx,
-    float scale);
+    float scale),
+    "use llama_sample_apply_guidance() instead");
 
 /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 LLAMA_API void llama_sample_softmax(
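
The classifier-free guidance sampler now operates directly on logit arrays rather than on a candidates array plus a guidance context, and the old entry point survives only as a deprecated wrapper. A hedged migration sketch, assuming llama_get_logits() from the unchanged parts of llama.h and that both contexts have just been decoded so their logits are current.

#include "llama.h"

// Sketch only: apply guidance with the new API; `logits` is modified in place.
void apply_cfg(struct llama_context * ctx, struct llama_context * guidance_ctx, float scale) {
    float * logits          = llama_get_logits(ctx);
    float * logits_guidance = llama_get_logits(guidance_ctx);

    llama_sample_apply_guidance(ctx, logits, logits_guidance, scale); // scale = 1.0f means no guidance
}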
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.12.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-11 00:00:00.000000000 Z
+date: 2024-01-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: