RubyGems - llama_cpp - Versions diffs - 0.3.3 → 0.3.5 - Mend

llama_cpp 0.3.3 → 0.3.5

Files changed (18) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +31 -0
data/ext/llama_cpp/extconf.rb +1 -0
data/ext/llama_cpp/llama_cpp.cpp +439 -9
data/ext/llama_cpp/src/ggml-cuda.cu +759 -136
data/ext/llama_cpp/src/ggml-metal.h +7 -0
data/ext/llama_cpp/src/ggml-metal.m +250 -111
data/ext/llama_cpp/src/ggml-metal.metal +614 -483
data/ext/llama_cpp/src/ggml.c +793 -1032
data/ext/llama_cpp/src/ggml.h +95 -18
data/ext/llama_cpp/src/k_quants.c +327 -3
data/ext/llama_cpp/src/k_quants.h +8 -0
data/ext/llama_cpp/src/llama.cpp +626 -166
data/ext/llama_cpp/src/llama.h +94 -10
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +1 -0
data/sig/llama_cpp.rbs +36 -1
metadata +2 -2

data/ext/llama_cpp/src/llama.h CHANGED Viewed

@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,12 +87,20 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
    struct llama_context_params {
-        uint32_t seed;                         // RNG seed, -1 for random
-        int32_t  n_ctx;                        // text context
-        int32_t  n_batch;                      // prompt processing batch size
-        int32_t  n_gpu_layers;                 // number of layers to store in VRAM
-        int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float    rope_freq_base;  // RoPE base frequency
+        float    rope_freq_scale; // RoPE frequency scaling factor
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
@@ -134,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
+    // grammar types
+    struct llama_grammar;
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+        // inverse char(s) ([^a], [^a-b], [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -148,6 +194,8 @@ extern "C" {
         int32_t n_eval;
     };
+    LLAMA_API int llama_max_devices();
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
@@ -270,10 +318,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +341,12 @@ extern "C" {
                                  float * scores,
                                    int   capacity);
+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,13 +359,28 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+                           llama_token   token);
+    LLAMA_API const char * llama_token_to_str_with_model(
+              const struct llama_model * model,
+                           llama_token   token);
     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos();  // end-of-sentence
     LLAMA_API llama_token llama_token_nl();   // next-line
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
     // Sampling functions
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +393,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @param guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
     /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale,
-                             float   smooth_factor);
+                             float   scale);
     /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -337,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);

data/lib/llama_cpp/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.3'
+  VERSION = '0.3.5'
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-32c5411'
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end

data/lib/llama_cpp.rb CHANGED Viewed

@@ -109,3 +109,4 @@ module LLaMACpp
 end
 LLaMACpp.backend_init
+at_exit { LLaMACpp.backend_free }

data/sig/llama_cpp.rbs CHANGED Viewed

@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -39,6 +47,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer
   class TokenData
     public
@@ -69,6 +78,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end
   class Timings
@@ -109,7 +124,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
-    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -120,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
   class ContextParams
@@ -140,6 +157,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
@@ -166,4 +187,18 @@ module LLaMACpp
   end
   class Params = ContextParams
+  class GrammarElement
+    public
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-15 00:00:00.000000000 Z
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: