llama_cpp 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +2 -1
- data/ext/llama_cpp/llama_cpp.cpp +315 -8
- data/ext/llama_cpp/src/ggml-alloc.c +541 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2271 -414
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +218 -87
- data/ext/llama_cpp/src/ggml-metal.metal +72 -55
- data/ext/llama_cpp/src/ggml.c +754 -996
- data/ext/llama_cpp/src/ggml.h +94 -18
- data/ext/llama_cpp/src/k_quants.c +350 -24
- data/ext/llama_cpp/src/llama.cpp +713 -179
- data/ext/llama_cpp/src/llama.h +61 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +26 -0
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;
-        int32_t  n_ctx;
-        int32_t  n_batch;
-        int32_t  n_gpu_layers;
-        int32_t  main_gpu;
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
@@ -102,6 +108,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
@@ -140,6 +147,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
@@ -332,6 +373,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -366,6 +416,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +440,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
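The header changes above add three new knobs to llama_context_params (n_gqa, rms_norm_eps, mul_mat_q). A minimal Ruby sketch of how they surface through the gem follows; seed and mul_mat_q are confirmed by the RBS changes later in this diff, while the n_gqa and rms_norm_eps accessor names are assumptions that simply mirror the C field names.

  require 'llama_cpp'

  params = LLaMACpp::ContextParams.new
  params.seed      = 42
  params.mul_mat_q = true      # new in this release: opt in to the experimental mul_mat_q kernels
  # Assumed accessors mirroring the new C fields (not shown in the RBS excerpt below):
  # params.n_gqa        = 8    # grouped-query attention, e.g. for LLaMA v2 70B
  # params.rms_norm_eps = 5e-6 # matches LLAMA_DEFAULT_RMS_EPS in llama.h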
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-468ea24'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -153,6 +163,8 @@ module LLaMACpp
     def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
+    def mul_mat_q: () -> bool
+    def mul_mat_q=: (bool) -> bool
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -177,4 +189,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
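Taken together, the signatures above describe the new grammar-constrained sampling flow: filter the candidate tokens through the grammar, sample as usual, then feed the chosen token back into the grammar. A minimal Ruby sketch, assuming an already-evaluated LLaMACpp::Context named context and a LLaMACpp::TokenDataArray named candidates built from its logits (only the grammar-related calls are taken from the signatures above):

  require 'llama_cpp'

  # One rule, root ::= "a": match the single character "a", then end of rule.
  root_rule = [
    LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: 'a'.ord),
    LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END, value: 0)
  ]
  grammar = LLaMACpp::Grammar.new(rules: [root_rule], start_rule_index: 0)

  context.sample_grammar(candidates, grammar: grammar)          # drop candidates the grammar forbids
  token = context.sample_token(candidates)                      # sample from what remains
  context.grammar_accept_token(grammar: grammar, token: token)  # advance the grammar state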
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-…
+date: 2023-08-04 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -30,6 +30,8 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-alloc.c
+- ext/llama_cpp/src/ggml-alloc.h
 - ext/llama_cpp/src/ggml-cuda.cu
 - ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml-metal.h