llama_cpp 0.8.0 → 0.9.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +3 -11
- data/ext/llama_cpp/llama_cpp.cpp +228 -165
- data/ext/llama_cpp/src/ggml-cuda.cu +441 -77
- data/ext/llama_cpp/src/ggml-impl.h +237 -0
- data/ext/llama_cpp/src/ggml-metal.m +71 -42
- data/ext/llama_cpp/src/ggml-metal.metal +171 -35
- data/ext/llama_cpp/src/ggml-opencl.cpp +161 -169
- data/ext/llama_cpp/src/{k_quants.c → ggml-quants.c} +3329 -1099
- data/ext/llama_cpp/src/{k_quants.h → ggml-quants.h} +81 -22
- data/ext/llama_cpp/src/ggml.c +1303 -3419
- data/ext/llama_cpp/src/ggml.h +33 -11
- data/ext/llama_cpp/src/llama.cpp +1925 -2655
- data/ext/llama_cpp/src/llama.h +48 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +34 -14
- metadata +5 -4
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        =  0,
+        LLAMA_ROPE_SCALING_LINEAR      =  1,
+        LLAMA_ROPE_SCALING_YARN        =  2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
@@ -333,17 +348,14 @@ extern "C" {
    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0,  c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-                         int32_t   c0,
-                         int32_t   c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
@@ -494,21 +506,22 @@ extern "C" {
     // Vocab
     //
 
-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
 
-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
 
     //
     // Tokenization
@@ -560,21 +573,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   alpha_frequency,
-                           float   alpha_presence);
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@@ -605,6 +612,13 @@ extern "C" {
                            float   p,
                          size_t   min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -663,6 +677,7 @@ extern "C" {
                            float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
           llama_token_data_array * candidates);
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.8.0'
+  VERSION = '0.9.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1472'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp
 
      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )
 
      # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
 
      embd.each { |token| output << context.model.token_to_piece(token) }
 
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
    end
 
    output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -31,6 +31,12 @@ module LLaMACpp
   LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
   LLAMA_GRETYPE_CHAR_ALT: Integer
 
+  LLAMA_ROPE_SCALING_UNSPECIFIED: Integer
+  LLAMA_ROPE_SCALING_NONE: Integer
+  LLAMA_ROPE_SCALING_LINEAR: Integer
+  LLAMA_ROPE_SCALING_YARN: Integer
+  LLAMA_ROPE_SCALING_MAX_VALUE: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -82,6 +88,16 @@ module LLaMACpp
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
 
   class Timings
@@ -143,16 +159,6 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -162,20 +168,20 @@ module LLaMACpp
     def print_timings: () -> void
     def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
-    def kv_cache_tokens_rm: (Integer, Integer) -> void
+    def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
     def kv_cache_seq_cp: (Integer, Integer, Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
-    def kv_cache_seq_shift: (Integer, Integer, Integer) -> void
+    def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
@@ -201,10 +207,22 @@ module LLaMACpp
     def n_threads=: (Integer) -> Integer
     def n_threads_batch: () -> Integer
     def n_threads_batch=: (Integer) -> Integer
+    def rope_scaling_type=: (Integer) -> Integer
+    def rope_scaling_type: () -> Integer
     def rope_freq_base=: (Float) -> Float
     def rope_freq_base: () -> Float
     def rope_freq_scale=: (Float) -> Float
     def rope_freq_scale: () -> Float
+    def yarn_ext_factor=: (Float) -> Float
+    def yarn_ext_factor: () -> Float
+    def yarn_attn_factor=: (Float) -> Float
+    def yarn_attn_factor: () -> Float
+    def yarn_beta_fast=: (Float) -> Float
+    def yarn_beta_fast: () -> Float
+    def yarn_beta_slow=: (Float) -> Float
+    def yarn_beta_slow: () -> Float
+    def yarn_orig_ctx=: (Integer) -> Integer
+    def yarn_orig_ctx: () -> Integer
     def mul_mat_q: () -> bool
     def mul_mat_q=: (bool) -> bool
     def f16_kv: () -> bool
@@ -228,6 +246,8 @@ module LLaMACpp
     def quantize_output_tensor=: (bool) -> bool
     def only_copy: () -> bool
     def only_copy=: (bool) -> bool
+    def pure: () -> bool
+    def pure=: (bool) -> bool
   end
 
   class Params = ContextParams
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-11-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -36,6 +36,7 @@ files:
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
+- ext/llama_cpp/src/ggml-impl.h
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
@@ -43,10 +44,10 @@ files:
 - ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
+- ext/llama_cpp/src/ggml-quants.c
+- ext/llama_cpp/src/ggml-quants.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/k_quants.c
-- ext/llama_cpp/src/k_quants.h
 - ext/llama_cpp/src/llama-util.h
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h