RubyGems - llama_cpp - Versions diffs - 0.7.1 → 0.9.0 - Mend

llama_cpp 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/CHANGELOG.md +18 -0
data/examples/chat.rb +8 -6
data/ext/llama_cpp/extconf.rb +2 -2
data/ext/llama_cpp/llama_cpp.cpp +122 -183
data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
data/ext/llama_cpp/src/ggml-metal.m +57 -8
data/ext/llama_cpp/src/ggml-metal.metal +171 -2
data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
data/ext/llama_cpp/src/ggml.c +375 -93
data/ext/llama_cpp/src/ggml.h +11 -9
data/ext/llama_cpp/src/k_quants.c +12 -20
data/ext/llama_cpp/src/llama.cpp +459 -153
data/ext/llama_cpp/src/llama.h +34 -33
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +4 -4
data/sig/llama_cpp.rbs +15 -16
metadata +3 -3

data/ext/llama_cpp/src/llama.h CHANGED Viewed

@@ -133,11 +133,12 @@ extern "C" {
     typedef struct llama_batch {
         int32_t n_tokens;
-        llama_token  * token;
-        float        * embd;
-        llama_pos    * pos;
-        llama_seq_id * seq_id;
-        int8_t       * logits;
+        llama_token  *  token;
+        float        *  embd;
+        llama_pos    *  pos;
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits;
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
@@ -446,7 +447,8 @@ extern "C" {
                     llama_pos   pos_0,
                  llama_seq_id   seq_id);
-    // Allocates a batch of tokens on the heap
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+    // Each token can be assigned up to n_seq_max sequence ids
     // The batch has to be freed with llama_batch_free()
     // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
     // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@@ -454,7 +456,8 @@ extern "C" {
     // All members are left uninitialized
     LLAMA_API struct llama_batch llama_batch_init(
             int32_t n_tokens,
-            int32_t embd);
+            int32_t embd,
+            int32_t n_seq_max);
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
@@ -491,37 +494,41 @@ extern "C" {
     // Vocab
     //
-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
     //
     // Tokenization
     //
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+    ///                Does not insert a leading space.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                              int   text_len,
                      llama_token * tokens,
                              int   n_max_tokens,
-                            bool   add_bos);
+                            bool   add_bos,
+                            bool   special);
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
@@ -554,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                          float    penalty);
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
                const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   alpha_frequency,
-                           float   alpha_presence);
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present);
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.

data/lib/llama_cpp/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.7.1'
+  VERSION = '0.9.0'
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1380'
+  LLAMA_CPP_VERSION = 'b1429'
 end

data/lib/llama_cpp.rb CHANGED Viewed

@@ -67,9 +67,9 @@ module LLaMACpp
         # apply penalties
         last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-        context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-        context.sample_frequency_and_presence_penalties(
-          candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+        context.sample_repetition_penalties(
+          candidates, last_n_tokens[-last_n_repeat..],
+          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
         )
         # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp
       embd.each { |token| output << context.model.token_to_piece(token) }
-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end
     output.join.scrub('?').strip.delete_prefix(prompt).strip

data/sig/llama_cpp.rbs CHANGED Viewed

@@ -78,10 +78,20 @@ module LLaMACpp
     def n_embd: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
-    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end
   class Timings
@@ -117,7 +127,7 @@ module LLaMACpp
   class Batch
     public
-    def initialize: (n_tokens: Integer, embd: Integer) -> void
+    def initialize: (n_tokens: Integer, embd: Integer, n_seq_max: Integer) -> void
     def n_tokens=: (Integer) -> Integer
     def n_tokens: () -> Integer
     def all_pos_zero=: (Integer) -> Integer
@@ -130,8 +140,8 @@ module LLaMACpp
     def get_token: (Integer) -> Integer
     def set_pos: (Integer, Integer) -> Integer
     def get_pos: (Integer) -> Integer
-    def set_seq_id: (Integer, Integer) -> Integer
-    def get_seq_id: (Integer) -> Integer
+    def set_seq_id: (Integer, Integer, Integer) -> Integer
+    def get_seq_id: (Integer, Integer) -> Integer
     def set_logit: (Integer, bool) -> bool
     def get_logit: (Integer) -> bool
   end
@@ -143,16 +153,6 @@ module LLaMACpp
     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-13 00:00:00.000000000 Z
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -78,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.19
+rubygems_version: 3.4.20
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.