llama_cpp 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -26,6 +26,14 @@
  # define LLAMA_API
  #endif

+ #ifdef __GNUC__
+ # define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+ #elif defined(_MSC_VER)
+ # define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+ #else
+ # define DEPRECATED(func, hint) func
+ #endif
+
  #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
  #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
  // TODO: show sample usage
  //

+ struct llama_model;
  struct llama_context;

  typedef int llama_token;
@@ -71,28 +80,27 @@ extern "C" {

  typedef void (*llama_progress_callback)(float progress, void *ctx);

- struct llama_context_params {
+ struct llama_context_params {
+ int seed; // RNG seed, -1 for random
  int n_ctx; // text context
  int n_batch; // prompt processing batch size
  int n_gpu_layers; // number of layers to store in VRAM
  int main_gpu; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
- int seed; // RNG seed, -1 for random
+ // called with a progress value between 0 and 1, pass NULL to disable
+ llama_progress_callback progress_callback;
+ // context pointer passed to the progress callback
+ void * progress_callback_user_data;

+ // Keep the booleans together to avoid misalignment during copy-by-value.
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible
  bool use_mlock; // force system to keep model in RAM
  bool embedding; // embedding mode only
-
- // called with a progress value between 0 and 1, pass NULL to disable
- llama_progress_callback progress_callback;
- // context pointer passed to the progress callback
- void * progress_callback_user_data;
  };
-
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
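
In the Ruby bindings these fields surface as accessors on LLaMACpp::ContextParams (aliased as Params in the signature changes below). A minimal sketch of configuring them from Ruby, using only setter names that appear elsewhere in this diff (the removed Client code at the end uses the same ones); the values are illustrative placeholders:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed      = -1     # RNG seed, -1 for random (matches the C comment above)
    params.n_ctx     = 512    # text context size
    params.f16_kv    = true   # fp16 KV cache
    params.use_mmap  = true
    params.use_mlock = false
    params.embedding = false  # enable only when embeddings are needed
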
@@ -132,17 +140,29 @@ extern "C" {

  // TODO: not great API - very likely to change
  // Initialize the llama + ggml backend
+ // If numa is true, use NUMA optimizations
  // Call once at the start of the program
- LLAMA_API void llama_init_backend();
+ LLAMA_API void llama_init_backend(bool numa);

  LLAMA_API int64_t llama_time_us();

+ LLAMA_API struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_context_params params);
+
+ LLAMA_API void llama_free_model(struct llama_model * model);
+
+ LLAMA_API struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params);
+
  // Various functions for loading a ggml llama model.
  // Allocate (almost) all memory needed for the model.
  // Return NULL on failure
- LLAMA_API struct llama_context * llama_init_from_file(
+ LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
  const char * path_model,
- struct llama_context_params params);
+ struct llama_context_params params),
+ "please use llama_load_model_from_file combined with llama_new_context_with_model instead");

  // Frees all allocated memory
  LLAMA_API void llama_free(struct llama_context * ctx);
@@ -159,8 +179,15 @@ extern "C" {
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
  // will be applied on top of the previous one
  // Returns 0 on success
- LLAMA_API int llama_apply_lora_from_file(
+ LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
  struct llama_context * ctx,
+ const char * path_lora,
+ const char * path_base_model,
+ int n_threads),
+ "please use llama_model_apply_lora_from_file instead");
+
+ LLAMA_API int llama_model_apply_lora_from_file(
+ const struct llama_model * model,
  const char * path_lora,
  const char * path_base_model,
  int n_threads);
@@ -311,7 +338,7 @@ extern "C" {
  #include <string>
  struct ggml_tensor;

- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

  #endif

data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.2.1'
+ VERSION = '0.3.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-a09f919'
+ LLAMA_CPP_VERSION = 'master-9d23589'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,7 +2,6 @@

  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
- require_relative 'llama_cpp/client'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -20,7 +19,6 @@ module LLaMACpp
  # @return [String]
  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
- raise ArgumentError, 'context must have loaded the model' if context.empty?
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
data/sig/llama_cpp.rbs CHANGED
@@ -25,7 +25,7 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

- def self?.init_backend: () -> void
+ def self?.init_backend: (?numa: bool) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
@@ -55,17 +55,24 @@ module LLaMACpp
  def sorted: () -> bool
  end

- class Context
+ class Model
  public

  def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  | () -> void
- def embeddings: () -> Array[Float]
  def empty?: () -> bool
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
- def eval_export: (String) -> bool
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+ def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ end
+
+ class Context
+ public
+
+ def initialize: (model: ::LLaMACpp::Model) -> void
+ def embeddings: () -> Array[Float]
+ def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_export: (String) -> bool
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
@@ -75,7 +82,6 @@ module LLaMACpp
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
- def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  def kv_cache_token_count: () -> Integer
  def set_rng_seed: (Integer) -> void
  def load_session_file: (session_path: String) -> void
@@ -138,15 +144,4 @@ module LLaMACpp
  end

  class Params = ContextParams
-
- class Client
- def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
- ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
- ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
- def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
- ?frequency: Float, ?presence: Float,
- ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
- ?repeat_penalty: Float) -> String
- def embeddings(String) -> Array[Float]
- end
  end
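
Per these updated signatures, model loading and context creation are now separate steps in the Ruby API as well, mirroring llama_load_model_from_file / llama_new_context_with_model in the header changes above, and apply_lora_from_file has moved from Context to Model. A minimal sketch of the 0.3.0 flow, assuming a local GGML model file (the paths and numeric values are placeholders):

    require 'llama_cpp'

    LLaMACpp.init_backend(numa: false)   # init_backend now takes the numa flag

    params = LLaMACpp::ContextParams.new
    params.seed = 123

    model = LLaMACpp::Model.new(model_path: './model.bin', params: params)
    # LoRA adapters are now applied on the model, not the context:
    # model.apply_lora_from_file(lora_path: './lora.bin', n_threads: 2)

    context = LLaMACpp::Context.new(model: model)
    puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 32, n_threads: 2)

The deprecated C entry points (llama_init_from_file, llama_apply_lora_from_file) are still declared, and their deprecation hints point callers at the model-based replacements shown above.
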
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.2.1
+ version: 0.3.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-17 00:00:00.000000000 Z
+ date: 2023-06-29 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -24,6 +24,7 @@ files:
  - README.md
  - examples/README.md
  - examples/chat.rb
+ - examples/embedding.rb
  - ext/llama_cpp/extconf.rb
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
@@ -43,7 +44,6 @@ files:
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
  - lib/llama_cpp.rb
- - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb
  - sig/llama_cpp.rbs
  homepage: https://github.com/yoshoku/llama_cpp.rb
data/lib/llama_cpp/client.rb DELETED
@@ -1,172 +0,0 @@
- # frozen_string_literal: true
-
- module LLaMACpp
- # Client provides a high-level interface to the LLM model.
- class Client # rubocop:disable Metrics/ClassLength
- # Creates a new client.
- #
- # @param model_path [String] The path to the model file.
- # @param lora_adapter_path [String] The path to the LoRA adapter file.
- # @param lora_base_path [String] The path to the LoRA base model file.
- # @param n_ctx [Integer] The context size.
- # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
- # @param use_mmap [Boolean] The flag whether to use mmap.
- # @param use_mlock [Boolean] The flag hether to use mlock.
- # @param embedding [Boolean] The flag whether to calculate embedding.
- # @param n_threads [Integer] The number of threads to use.
- # @param seed [Integer] The seed for the random number generator.
- # @return [Client]
- # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
- def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
- n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
- embedding: false,
- n_threads: 1, seed: 0)
- @params = {
- model_path: model_path,
- lora_adapter_path: lora_adapter_path,
- lora_base_path: lora_base_path,
- n_ctx: n_ctx,
- memory_f16: memory_f16,
- use_mmap: use_mmap,
- use_mlock: use_mlock,
- embedding: embedding,
- n_threads: n_threads,
- seed: seed
- }
- @context_params = ContextParams.new
- @context_params.n_ctx = n_ctx
- @context_params.n_parts = n_parts
- @context_params.f16_kv = memory_f16
- @context_params.use_mmap = use_mmap
- @context_params.use_mlock = use_mlock
- @context_params.embedding = embedding
- @context_params.seed = seed
- @context = Context.new(model_path: model_path, params: @context_params)
- return unless lora_adapter_path.is_a?(String)
-
- if lora_base_path.is_a?(String)
- @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
- else
- @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
- end
- end
- # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
-
- # Generates completions for a given prompt.
- #
- # @param prompt [String] The prompt to generate completions for.
- # @param max_tokens [Integer] The maximum number of tokens to generate.
- # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
- # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
- # @param n_batch [Integer] The batch size.
- # @param frequency [Float] The frequency penalty value.
- # @param presence [Float] The presence penalty value.
- # @param top_k [Integer] The top-k value.
- # @param top_p [Float] The top-p value.
- # @param tfs_z [Float] The tail free sampling parameter.
- # @param typical_p [Float] The typical probability value.
- # @param temperature [Float] The temperature value.
- # @param repeat_penalty [Float] The repeat penalty value.
- # @return [String]
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
- def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
- frequency: 0.0, presence: 0.0,
- top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
- embd_input = tokenize_prompt(prompt)
-
- n_ctx = @context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
- last_n_tokens = [0] * n_ctx
-
- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = max_tokens
- n_vocab = @context.n_vocab
- output = []
-
- while n_remain != 0
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = n_keep
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
- end
-
- n_past += embd.size
- embd.clear
-
- if embd_input.size <= n_consumed
- logits = @context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- # apply penalties
- last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
- @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
- @context.sample_frequency_and_presence_penalties(
- candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
- )
-
- # temperature sampling
- @context.sample_top_k(candidates, k: top_k)
- @context.sample_tail_free(candidates, z: tfs_z)
- @context.sample_typical(candidates, prob: typical_p)
- @context.sample_top_p(candidates, prob: top_p)
- @context.sample_temperature(candidates, temperature: temperature)
- id = @context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- embd.push(id)
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= n_batch
- end
- end
-
- embd.each { |token| output << @context.token_to_str(token) }
-
- break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
- end
-
- output.join.delete_prefix(" #{prompt}").strip
- end
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-
- # def chat(prompt); end
-
- # Obtains the embedding for a given text.
- #
- # @param text [String] The text to obtain the embedding for.
- # @return [Array<Float>]
- def embeddings(text)
- raise 'The embedding option is set to false' unless @params[:embedding]
-
- embd_input = tokenize_prompt(text)
- raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
-
- @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
- @context.embeddings
- end
-
- private
-
- def tokenize_prompt(prompt)
- @context.tokenize(text: " #{prompt}", add_bos: true)
- end
- end
- end
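
Client is removed in 0.3.0 with no drop-in replacement in this diff (the gemspec file list above instead adds examples/embedding.rb). The embedding flow that Client#embeddings wrapped can be driven through Context directly. A minimal sketch based on the removed method body above, adapted to the new Model/Context construction; the model path is a placeholder:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.embedding = true   # embedding mode, as Client used to set

    model   = LLaMACpp::Model.new(model_path: './model.bin', params: params)
    context = LLaMACpp::Context.new(model: model)

    tokens = context.tokenize(text: ' Hello, World.', add_bos: true)
    context.eval(tokens: tokens, n_past: 0, n_threads: 2)
    p context.embeddings      # => Array[Float]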