llama_cpp 0.9.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +147 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +288 -92
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +58 -37
- data/ext/llama_cpp/src/ggml-metal.metal +162 -34
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +939 -3333
- data/ext/llama_cpp/src/ggml.h +25 -4
- data/ext/llama_cpp/src/llama.cpp +1819 -2554
- data/ext/llama_cpp/src/llama.h +32 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -2
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE = 0,
+        LLAMA_ROPE_SCALING_LINEAR = 1,
+        LLAMA_ROPE_SCALING_YARN = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit; // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
     uint32_t n_batch; // prompt processing maximum batch size
     uint32_t n_threads; // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
+    int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-    float rope_freq_base;  // RoPE base frequency, 0 = from model
-    float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+    float    rope_freq_base;   // RoPE base frequency, 0 = from model
+    float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+    float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+    float    yarn_attn_factor; // YaRN magnitude scaling factor
+    float    yarn_beta_fast;   // YaRN low correction dim
+    float    yarn_beta_slow;   // YaRN high correction dim
+    uint32_t yarn_orig_ctx;    // YaRN original context size
 
     // Keep the booleans together to avoid misalignment during copy-by-value.
-    bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
     bool f16_kv; // use fp16 for KV cache, fp32 otherwise
     bool logits_all; // the llama_eval() call computes all logits, not just the last one
     bool embedding; // embedding mode only
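The new llama_context_params fields expose llama.cpp's YaRN RoPE scaling. Below is a minimal sketch (not part of the gem) of filling them in from C against this header; llama_context_default_params() and llama_new_context_with_model() are pre-existing llama.h calls, and the numeric values are illustrative placeholders, not recommendations:

#include "llama.h"

// sketch: build a llama_context_params with the new RoPE/YaRN fields set
struct llama_context_params make_yarn_params(void) {
    struct llama_context_params cparams = llama_context_default_params();

    // new int8_t field; values come from enum llama_rope_scaling_type
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;

    cparams.yarn_orig_ctx    = 4096;  // placeholder: the model's original training context
    cparams.yarn_ext_factor  = 1.0f;  // extrapolation mix factor (NaN = take from model)
    cparams.yarn_attn_factor = 1.0f;  // magnitude scaling factor
    cparams.yarn_beta_fast   = 32.0f; // low correction dim
    cparams.yarn_beta_slow   = 1.0f;  // high correction dim

    return cparams; // pass to llama_new_context_with_model() as usual
}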
@@ -191,6 +205,7 @@ extern "C" {
     bool allow_requantize; // allow quantizing non-f32/f16 tensors
     bool quantize_output_tensor; // quantize output.weight
     bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+    bool pure; // disable k-quant mixtures and quantize all tensors to the same type
 } llama_model_quantize_params;
 
 // grammar types
@@ -333,17 +348,14 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-            int32_t c0,
-            int32_t c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0, p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
             llama_seq_id seq_id,
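With this change, clearing the whole cache goes through llama_kv_cache_clear() while ranged removal stays on llama_kv_cache_seq_rm(). A small sketch of the distinction, assuming ctx is a valid llama_context created elsewhere:

#include "llama.h"

// sketch: the two ways of pruning the KV cache after this release
static void prune_kv_cache(struct llama_context * ctx) {
    // drop every cached token, regardless of sequence (replaces llama_kv_cache_tokens_rm)
    llama_kv_cache_clear(ctx);

    // or remove only positions [32, inf) of sequence 0; p1 < 0 means "to the end"
    llama_kv_cache_seq_rm(ctx, 0, 32, -1);

    // seq_id < 0 matches any sequence, so this also empties the cache
    llama_kv_cache_seq_rm(ctx, -1, 0, -1);
}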
@@ -600,6 +612,13 @@ extern "C" {
             float p,
             size_t min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            float p,
+            size_t min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
             struct llama_context * ctx,
             llama_token_data_array * candidates);
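A sketch of a sampling step that slots the new llama_sample_min_p() in front of the existing temperature sampler; llama_n_vocab(), llama_get_logits(), llama_sample_temp() and llama_sample_token() are pre-existing llama.h calls, and the 0.05 / 0.80 values are illustrative only:

#include "llama.h"
#include <stdbool.h>
#include <stdlib.h>

// sketch: assumes `ctx` and `model` are valid and llama_decode() has just produced logits
static llama_token sample_next_token(struct llama_context * ctx, const struct llama_model * model) {
    const int n_vocab = llama_n_vocab(model);
    float * logits = llama_get_logits(ctx);

    // build the candidate array from the raw logits
    llama_token_data * data = malloc(n_vocab * sizeof(llama_token_data));
    for (int i = 0; i < n_vocab; i++) {
        data[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    // min-p: keep only tokens whose probability is at least 0.05 times that of the top token
    llama_sample_min_p(ctx, &candidates, 0.05f, 1);
    llama_sample_temp(ctx, &candidates, 0.80f);
    const llama_token id = llama_sample_token(ctx, &candidates);

    free(data);
    return id;
}

For deterministic decoding, llama_sample_token_greedy() could replace the last two calls; per the new comment above, it does not fill in token probabilities, so call llama_sample_softmax() first if those are needed.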
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -162,11 +168,11 @@ module LLaMACpp
   def print_timings: () -> void
   def reset_timings: () -> void
   def kv_cache_token_count: () -> Integer
-  def kv_cache_tokens_rm: (Integer, Integer) -> void
+  def kv_cache_clear: () -> void
   def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
   def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
   def kv_cache_seq_keep: (Integer) -> void
-  def kv_cache_seq_shift: (Integer, Integer,Integer, Integer) -> void
+  def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
   def set_rng_seed: (Integer) -> void
   def load_session_file: (session_path: String) -> void
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
@@ -175,6 +181,7 @@ module LLaMACpp
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+  def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
   def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
   def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -200,10 +207,22 @@ module LLaMACpp
   def n_threads=: (Integer) -> Integer
   def n_threads_batch: () -> Integer
   def n_threads_batch=: (Integer) -> Integer
+  def rope_scaling_type=: (Integer) -> Integer
+  def rope_scaling_type: () -> Integer
   def rope_freq_base=: (Float) -> Float
   def rope_freq_base: () -> Float
   def rope_freq_scale=: (Float) -> Float
   def rope_freq_scale: () -> Float
+  def yarn_ext_factor=: (Float) -> Float
+  def yarn_ext_factor: () -> Float
+  def yarn_attn_factor=: (Float) -> Float
+  def yarn_attn_factor: () -> Float
+  def yarn_beta_fast=: (Float) -> Float
+  def yarn_beta_fast: () -> Float
+  def yarn_beta_slow=: (Float) -> Float
+  def yarn_beta_slow: () -> Float
+  def yarn_orig_ctx=: (Integer) -> Integer
+  def yarn_orig_ctx: () -> Integer
   def mul_mat_q: () -> bool
   def mul_mat_q=: (bool) -> bool
   def f16_kv: () -> bool
@@ -227,6 +246,8 @@ module LLaMACpp
   def quantize_output_tensor=: (bool) -> bool
   def only_copy: () -> bool
   def only_copy=: (bool) -> bool
+  def pure: () -> bool
+  def pure=: (bool) -> bool
 end
 
 class Params = ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h