llama_cpp 0.3.2 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
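On the Ruby side the new RoPE fields surface as `ContextParams` accessors (see the `llama_cpp.rbs` changes further down). A minimal sketch of stretching the context window via RoPE frequency scaling; the values are illustrative, not recommendations:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx           = 4096     # ask for a longer context...
params.rope_freq_base  = 10000.0  # keep the default RoPE base frequency
params.rope_freq_scale = 0.5      # ...and halve the RoPE frequency to cover it

# tensor_split is exposed read-only from Ruby in this release (rbs below):
p params.tensor_split             # => per-GPU split ratios (size: LLAMA_MAX_DEVICES)
```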
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -158,7 +166,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();

     LLAMA_API int64_t llama_time_us();

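`llama_backend_init`/`llama_backend_free` replace the previous one-shot initializer and are mirrored by module-level methods in the Ruby bindings (rbs below), alongside the new `llama_max_devices`. A rough lifecycle sketch:

```ruby
require 'llama_cpp'

LLaMACpp.backend_init(numa: false)          # call once, at program start
puts "max devices: #{LLaMACpp.max_devices}" # new llama_max_devices binding

begin
  # ... load a model, evaluate, sample ...
ensure
  LLaMACpp.backend_free                     # call once, at program end (MPI teardown)
end
```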
@@ -268,10 +278,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -280,6 +301,12 @@ extern "C" {
                           float * scores,
                             int   capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+                    const char * * strings,
+                           float * scores,
+                             int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -292,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API const char * llama_token_to_str(
+        const struct llama_context * ctx,
+                         llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+          const struct llama_model * model,
+                         llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
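The `_with_model`/`_from_model` variants let callers query a model directly, without spinning up a `llama_context`; the Ruby `Model` class gains matching instance methods (rbs additions below). A sketch, assuming `Model.new` takes the same keywords as `Model#load` shown in the rbs, with a placeholder model path:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'path/to/model.bin', # placeholder path
                            params: LLaMACpp::ContextParams.new)

puts model.n_vocab                                   # vocabulary size, no context needed
tokens = model.tokenize(text: 'Hello world', add_bos: true)
puts tokens.map { |id| model.token_to_str(id) }.join
```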
@@ -307,6 +340,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
+                              float   scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

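Classifier-free guidance samples against two contexts over the same model: the main one and a guidance context primed with a negative prompt. A hedged sketch of the new Ruby binding (signature in the rbs below); it assumes `context` and `guidance_ctx` are `LLaMACpp::Context` instances over the same model that have both been eval'ed, and that `candidates` is built from the raw logits in the usual `TokenDataArray` fashion:

```ruby
# guidance_ctx was fed the negative prompt plus the main context's tokens.
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.each_with_index.map do |logit, id|
    LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0)  # unsorted, as the header requires
  end
)
context.sample_classifier_free_guidance(candidates, guidance: guidance_ctx, scale: 1.5)
# scale == 1.0 disables guidance; larger values steer harder away from the negative prompt.
```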
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer

-  def self?.
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -38,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer

   class TokenData
     public
@@ -68,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end

   class Timings
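`Model#vocab` returns the token strings and their scores as two parallel arrays, which is handy for inspecting a model's tokenizer; a small sketch assuming `model` is a loaded `LLaMACpp::Model`:

```ruby
strings, scores = model.vocab(capacity: model.n_vocab)
strings.first(5).zip(scores.first(5)).each do |str, score|
  puts format('%-12s %.3f', str, score)  # token text and its score
end
```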
@@ -108,6 +116,7 @@ module LLaMACpp
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
   def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
   def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+  def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -138,6 +147,10 @@ module LLaMACpp
   def main_gpu: () -> Integer
   def main_gpu=: (Integer) -> Integer
   def tensor_split: () -> Array[Float]
+  def rope_freq_base=: (Float) -> Float
+  def rope_freq_base: () -> Float
+  def rope_freq_scale=: (Float) -> Float
+  def rope_freq_scale: () -> Float
   def low_vram: () -> bool
   def low_vram=: (bool) -> bool
   def seed: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -35,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c