llama_cpp 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -144,7 +158,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
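The renamed backend hooks are exposed to Ruby as LLaMACpp.backend_init and LLaMACpp.backend_free (see the RBS changes further down). A minimal lifecycle sketch, assuming only that the gem is installed:

  require 'llama_cpp'

  # Call once at program start; numa: true enables NUMA optimizations.
  LLaMACpp.backend_init(numa: false)

  begin
    # ... create contexts and run inference here ...
  ensure
    # New in this release: call once at program exit (currently only tears down MPI state).
    LLaMACpp.backend_free
  end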
@@ -293,6 +309,18 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
@@ -331,6 +359,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
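The new llama_get_timings() call is mirrored on the Ruby side by LLaMACpp::Context#timings, which returns the LLaMACpp::Timings object declared in the RBS changes below. A rough sketch of reading it, assuming `context` is an already initialized LLaMACpp::Context that has evaluated some tokens:

  timings = context.timings  # => LLaMACpp::Timings

  # Millisecond totals per phase, with the corresponding token counts.
  printf("load:        %8.2f ms\n", timings.t_load_ms)
  printf("sample:      %8.2f ms / %d tokens\n", timings.t_sample_ms, timings.n_sample)
  printf("prompt eval: %8.2f ms / %d tokens\n", timings.t_p_eval_ms, timings.n_p_eval)
  printf("eval:        %8.2f ms / %d tokens\n", timings.t_eval_ms, timings.n_eval)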
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.1'
+  VERSION = '0.3.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-b8c8dda'
+  LLAMA_CPP_VERSION = 'master-32c5411'
 end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
@@ -105,4 +108,4 @@ module LLaMACpp
   end
 end
 
-LLaMACpp.init_backend
+LLaMACpp.backend_init
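Since the sampling settings are now keyword arguments rather than hard-coded locals, callers can tune them per invocation; omitted keywords fall back to the defaults shown above. A sketch, assuming `context` is an initialized LLaMACpp::Context (the prompt string is purely illustrative):

  output = LLaMACpp.generate(
    context, 'Building a website can be done in 10 simple steps:',
    n_predict: 128, n_threads: 4,
    top_k: 40, top_p: 0.95, temperature: 0.8,
    repeat_last_n: 64, repeat_penalty: 1.1
  )
  puts output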
data/sig/llama_cpp.rbs CHANGED
@@ -26,9 +26,13 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: (?numa: bool) -> void
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -67,6 +71,20 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
@@ -80,6 +98,7 @@ module LLaMACpp
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
@@ -90,6 +109,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
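The classifier-free guidance sampler added in llama.h is bound as Context#sample_classifier_free_guidance with the signature above. A hedged sketch of one sampling step, assuming `context` and `guidance_context` are two contexts over the same model, the negative prompt has already been evaluated in `guidance_context`, and the candidate array is built from unsorted logits following the pattern used inside LLaMACpp.generate:

  # Candidate tokens must come from the main context's raw (unsorted) logits.
  logits = context.logits
  candidates = LLaMACpp::TokenDataArray.new(
    Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
  )

  # Blend the guidance context's logits into the candidates.
  # scale: 1.0 means no guidance; smooth_factor: 1.0 uses only the guidance logits.
  context.sample_classifier_free_guidance(candidates, guidance: guidance_context,
                                          scale: 1.5, smooth_factor: 0.5)

  # ...then apply the usual samplers (e.g. sample_top_k / sample_top_p) to candidates.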
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-02 00:00:00.000000000 Z
+date: 2023-07-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -25,6 +25,7 @@ files:
 - examples/README.md
 - examples/chat.rb
 - examples/embedding.rb
+- examples/prompt_jp.txt
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
@@ -34,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c