llama_cpp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,7 +19,7 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION
+#define LLAMA_FILE_VERSION 2
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'
@@ -54,9 +54,9 @@ extern "C" {
 typedef void (*llama_progress_callback)(float progress, void *ctx);
 
 struct llama_context_params {
-int n_ctx;
-int
-int seed;
+int n_ctx;        // text context
+int n_gpu_layers; // number of layers to store in VRAM
+int seed;         // RNG seed, -1 for random
 
 bool f16_kv;     // use fp16 for KV cache
 bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -78,7 +78,7 @@
 LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = 5, //
+// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
 // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
 LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
@@ -134,7 +134,7 @@
 // Copies the state to the specified destination address.
 // Destination needs to have allocated enough memory.
 // Returns the number of bytes copied
-LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
+LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
 // Set the state reading from the specified address
 // Returns the number of bytes read
@@ -202,16 +202,16 @@
 LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep
+LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
 /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep
+LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
 /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
 /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
data/lib/llama_cpp/client.rb
CHANGED
@@ -9,7 +9,6 @@ module LLaMACpp
 # @param lora_adapter_path [String] The path to the LoRA adapter file.
 # @param lora_base_path [String] The path to the LoRA base model file.
 # @param n_ctx [Integer] The context size.
-# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
 # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
 # @param use_mmap [Boolean] The flag whether to use mmap.
 # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
 # @return [Client]
 # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
 def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-n_ctx: 512,
+n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
 embedding: false,
 n_threads: 1, seed: 0)
 @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
 lora_adapter_path: lora_adapter_path,
 lora_base_path: lora_base_path,
 n_ctx: n_ctx,
-n_parts: n_parts,
 memory_f16: memory_f16,
 use_mmap: use_mmap,
 use_mlock: use_mlock,
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.1.
+VERSION = '0.1.1'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'master-
+LLAMA_CPP_VERSION = 'master-6986c78'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -10,7 +10,6 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_Q4_0: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-LLAMA_FTYPE_MOSTLY_Q4_2: Integer
 LLAMA_FTYPE_MOSTLY_Q8_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -65,6 +64,8 @@ module LLaMACpp
 def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
 def kv_cache_token_count: () -> Integer
 def set_rng_seed: (Integer) -> void
+def load_session_file: (session_path: String) -> void
+def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
 def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
 def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
 def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
@@ -90,8 +91,6 @@ module LLaMACpp
 def logits_all=: (bool) -> bool
 def n_ctx: () -> Integer
 def n_ctx=: (Integer) -> Integer
-def n_parts: () -> Integer
-def n_parts=: (Integer) -> Integer
 def seed: () -> Integer
 def seed=: (Integer) -> Integer
 def use_mlock: () -> bool
@@ -106,7 +105,7 @@ module LLaMACpp
 
 class Client
 def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-?n_ctx: Integer, ?
+?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
 ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
 def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
 ?frequency: Float, ?presence: Float,
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-version: 0.1.
+version: 0.1.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-21 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: