llama_cpp 0.0.7 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -19,9 +19,11 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 1
- #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
- #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+ #define LLAMA_FILE_VERSION 1
+ #define LLAMA_FILE_MAGIC 'ggjt'
+ #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+ #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
  extern "C" {
@@ -39,18 +41,22 @@ extern "C" {

  typedef struct llama_token_data {
  llama_token id; // token id
-
+ float logit; // log-odds of the token
  float p; // probability of the token
- float plog; // log probability of the token
-
  } llama_token_data;

+ typedef struct llama_token_data_array {
+ llama_token_data * data;
+ size_t size;
+ bool sorted;
+ } llama_token_data_array;
+
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
  int n_ctx; // text context
  int n_parts; // -1 for default
- int seed; // RNG seed, 0 for random
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
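The sampling API now works on an explicit candidate list: llama_token_data carries the raw logit, and llama_token_data_array wraps the list plus a sorted flag. The gem mirrors these as LLaMACpp::TokenData and LLaMACpp::TokenDataArray; a minimal sketch of building candidates from the logits of the last evaluated token, assuming `context` is an evaluated LLaMACpp::Context:

    # One TokenData per vocabulary entry; p stays 0.0 until a sampler
    # (e.g. sample_softmax) fills the probabilities in.
    logits = context.logits
    candidates = LLaMACpp::TokenDataArray.new(
      Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
    )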
@@ -73,7 +79,7 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
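Q4_3 quantization support was dropped upstream; enum value 6 is now skipped rather than reused. The corresponding Ruby constant disappears as well (see the rbs change below), so a quick feature check from Ruby, assuming the gem is loaded:

    # Truthy only on gems that still ship Q4_3 support (0.0.7 and earlier).
    puts defined?(LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_3) ? 'Q4_3 available' : 'Q4_3 removed'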
@@ -116,13 +122,14 @@ extern "C" {
  int n_threads);

  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

  // Sets the current rng seed.
  LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

- // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
- LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+ // Returns the maximum size in bytes of the state (rng, logits, embedding
+ // and kv_cache) - will often be smaller after compacting tokens
+ LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);

  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
@@ -133,6 +140,10 @@ extern "C" {
  // Returns the number of bytes read
  LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

+ // Save/load session file
+ LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+ LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
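These two functions persist and restore the context state (RNG, logits, embeddings, KV cache) together with the processed token list, so a long prompt does not need to be re-evaluated on the next run. The 0.1.0 Ruby binding does not wrap them yet, so the following is only a hypothetical sketch of a round trip if they were exposed one-to-one; `load_session`/`save_session` are invented names, not gem API:

    # HYPOTHETICAL wrappers mirroring the C signatures above.
    tokens = context.tokenize(text: prompt, add_bos: true)
    if context.load_session(session_path, tokens)    # invented name
      # KV cache restored from disk; prompt evaluation can be skipped
    else
      # evaluate the prompt as usual, then persist the state
      context.save_session(session_path, tokens)     # invented name
    end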
@@ -156,9 +167,9 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

- LLAMA_API int llama_n_vocab(struct llama_context * ctx);
- LLAMA_API int llama_n_ctx (struct llama_context * ctx);
- LLAMA_API int llama_n_embd (struct llama_context * ctx);
+ LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+ LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);

  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
@@ -172,21 +183,57 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos();
  LLAMA_API llama_token llama_token_eos();
+ LLAMA_API llama_token llama_token_nl();
+
+ // Sampling functions
+
+ /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+ /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+ /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+ LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+ /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+ /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+ /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+ /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+ /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+ /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+ /// @details Selects the token with the highest probability.
+ LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);

- // TODO: improve the last_n_tokens interface ?
- LLAMA_API llama_token llama_sample_top_p_top_k(
- struct llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty);
+ /// @details Randomly selects a token from the candidates based on their probabilities.
+ LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

  // Performance information
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
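For callers migrating off llama_sample_top_p_top_k, the smallest replacement is the greedy or stochastic token picker applied to a candidates array (built as in the struct sketch above); via the gem's wrappers:

    # Deterministic decoding: pick the highest-logit candidate.
    id = context.sample_token_greedy(candidates)

    # Stochastic decoding: draw a token according to the candidates'
    # probabilities instead (penalties/top-k/top-p can be applied first,
    # as the updated client code below shows).
    # id = context.sample_token(candidates)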
data/lib/llama_cpp/client.rb CHANGED
@@ -2,7 +2,7 @@

  module LLaMACpp
  # Client provides a high-level interface to the LLM model.
- class Client
+ class Client # rubocop:disable Metrics/ClassLength
  # Creates a new client.
  #
  # @param model_path [String] The path to the model file.
@@ -61,14 +61,19 @@ module LLaMACpp
  # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
  # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
  # @param n_batch [Integer] The batch size.
+ # @param frequency [Float] The frequency penalty value.
+ # @param presence [Float] The presence penalty value.
  # @param top_k [Integer] The top-k value.
  # @param top_p [Float] The top-p value.
+ # @param tfs_z [Float] The tail free sampling parameter.
+ # @param typical_p [Float] The typical probability value.
  # @param temperature [Float] The temperature value.
  # @param repeat_penalty [Float] The repeat penalty value.
  # @return [String]
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
  def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
- top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+ frequency: 0.0, presence: 0.0,
+ top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
  embd_input = tokenize_prompt(prompt)

  n_ctx = @context.n_ctx
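A usage sketch of the extended signature (the model path is a placeholder; passing model_path as a keyword follows the constructor docs above):

    require 'llama_cpp'

    client = LLaMACpp::Client.new(model_path: '/path/to/model.bin', n_threads: 4)
    puts client.completions(
      'Hello, my name is',
      max_tokens: 64,
      frequency: 0.1, presence: 0.1,   # new OpenAI-style penalties
      top_k: 40, top_p: 0.95,
      tfs_z: 1.0, typical_p: 1.0,      # the 1.0 defaults leave TFS/typical sampling neutral
      temperature: 0.8, repeat_penalty: 1.1
    )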
@@ -80,6 +85,7 @@ module LLaMACpp
  n_consumed = 0
  n_past = 0
  n_remain = max_tokens
+ n_vocab = @context.n_vocab
  output = []

  while n_remain != 0
@@ -97,11 +103,28 @@ module LLaMACpp
  embd.clear

  if embd_input.size <= n_consumed
- start = n_ctx - repeat_last_n
- id = @context.sample_top_p_top_k(
- last_n_tokens[start...(start + repeat_last_n)],
- top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+ logits = @context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ @context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
  )
+
+ # temperature sampling
+ @context.sample_top_k(candidates, k: top_k)
+ @context.sample_tail_free(candidates, z: tfs_z)
+ @context.sample_typical(candidates, prob: typical_p)
+ @context.sample_top_p(candidates, prob: top_p)
+ @context.sample_temperature(candidates, temperature: temperature)
+ id = @context.sample_token(candidates)
+
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
  last_n_tokens.shift
  last_n_tokens.push(id)

data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.7'
+ VERSION = '0.1.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-11d9023'
+ LLAMA_CPP_VERSION = 'master-173d0e6'
  end
data/lib/llama_cpp.rb CHANGED
@@ -37,7 +37,16 @@ module LLaMACpp
  n_past = 0
  n_remain = n_predict
  repeat_last_n = 64
+ repeat_penalty = 1.1
+ frequency = 0.0
+ presence = 0.0
+ top_k = 40
+ top_p = 0.95
+ tfs_z = 1.0
+ typical_p = 1.0
+ temperature = 0.8
  n_batch = 512
+ n_vocab = context.n_vocab
  output = []

  while n_remain != 0
@@ -55,10 +64,25 @@ module LLaMACpp
  embd.clear

  if embd_input.size <= n_consumed
- start = n_ctx - repeat_last_n
- id = context.sample_top_p_top_k(
- last_n_tokens[start...(start + repeat_last_n)], top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
+ logits = context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
  )
+
+ # temperature sampling
+ context.sample_top_k(candidates, k: top_k)
+ context.sample_tail_free(candidates, z: tfs_z)
+ context.sample_typical(candidates, prob: typical_p)
+ context.sample_top_p(candidates, prob: top_p)
+ context.sample_temperature(candidates, temperature: temperature)
+ id = context.sample_token(candidates)
+
  last_n_tokens.shift
  last_n_tokens.push(id)

data/sig/llama_cpp.rbs CHANGED
@@ -11,7 +11,6 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
- LLAMA_FTYPE_MOSTLY_Q4_3: Integer
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -21,9 +20,30 @@ module LLaMACpp
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool

+ class TokenData
+ public
+
+ def initialize: (id: Integer, logit: Float, p: Float) -> void
+ def id: () -> Integer
+ def id=: (Integer) -> Integer
+ def logit: () -> Float
+ def logit=: (Float) -> Float
+ def p: () -> Float
+ def p=: (Float) -> Float
+ end
+
+ class TokenDataArray
+ public
+
+ def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+ def size: () -> Integer
+ def sorted: () -> bool
+ end
+
  class Context
  public

@@ -40,10 +60,23 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def print_timings: () -> void
  def reset_timings: () -> void
- def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def kv_cache_token_count: () -> Integer
+ def set_rng_seed: (Integer) -> void
+ def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+ def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+ def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+ def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+ def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+ def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+ def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+ def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+ def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
  end

  class ContextParams
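Note the mirostat wrappers return the sampled token together with the updated mu, which the caller threads through the generation loop. A minimal sketch per the signatures above; tau and eta here are just the common llama.cpp example defaults, not values mandated by the gem:

    tau = 5.0        # target surprise (cross-entropy)
    eta = 0.1        # learning rate for the mu update
    mu = 2.0 * tau   # initialized to twice the target, per the header docs

    # inside the generation loop, after building `candidates`:
    id, mu = context.sample_token_mirostat_v2(candidates, tau: tau, eta: eta, mu: mu)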
@@ -76,7 +109,9 @@ module LLaMACpp
  ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
  ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
  def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
- ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+ ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+ ?repeat_penalty: Float) -> String
  def embeddings(String) -> Array[Float]
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.7
+ version: 0.1.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-29 00:00:00.000000000 Z
+ date: 2023-05-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -31,9 +31,9 @@ files:
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
- - ext/llama_cpp/src/llama_util.h
  - lib/llama_cpp.rb
  - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb