llama_cpp 0.2.2 → 0.3.0

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -26,6 +26,14 @@
 #    define LLAMA_API
 #endif
 
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
 #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
 #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    struct llama_model;
     struct llama_context;
 
     typedef int llama_token;
@@ -131,17 +140,29 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
 
+    LLAMA_API struct llama_model * llama_load_model_from_file(
+            const char * path_model,
+            struct llama_context_params params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+            struct llama_model * model,
+            struct llama_context_params params);
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
            const char * path_model,
-           struct llama_context_params params);
+           struct llama_context_params params),
+           "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -158,8 +179,15 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
            struct llama_context * ctx,
+           const char * path_lora,
+           const char * path_base_model,
+           int n_threads),
+           "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+           const struct llama_model * model,
            const char * path_lora,
            const char * path_base_model,
            int n_threads);
@@ -310,7 +338,7 @@ extern "C" {
 #include <string>
 struct ggml_tensor;
 
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
 #endif
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.2.2'
+  VERSION = '0.3.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-7487137'
+  LLAMA_CPP_VERSION = 'master-9d23589'
 end
data/lib/llama_cpp.rb CHANGED
@@ -2,7 +2,6 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
-require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -20,7 +19,6 @@ module LLaMACpp
   # @return [String]
   def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
-    raise ArgumentError, 'context must have loaded the model' if context.empty?
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
     spaced_prompt = " #{prompt}"
data/sig/llama_cpp.rbs CHANGED
@@ -25,7 +25,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: () -> void
+  def self?.init_backend: (?numa: bool) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
@@ -55,17 +55,24 @@ module LLaMACpp
     def sorted: () -> bool
   end
 
-  class Context
+  class Model
     public
 
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
-    def embeddings: () -> Array[Float]
     def empty?: () -> bool
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
-    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+  end
+
+  class Context
+    public
+
+    def initialize: (model: ::LLaMACpp::Model) -> void
+    def embeddings: () -> Array[Float]
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
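The signatures above split the old all-in-one Context class into a Model (loading, LoRA) and a Context that is built from a model. A minimal usage sketch of the new flow, following these RBS signatures; the model path and sampling settings below are placeholders:

```ruby
require 'llama_cpp'

# New 0.3.0 flow: load a model once, then build a context from it.
LLaMACpp.init_backend(numa: false)

params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params) # placeholder path
context = LLaMACpp::Context.new(model: model)

puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 32, n_threads: 4)
```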
@@ -75,7 +82,6 @@
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
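As the hunk above shows, apply_lora_from_file has moved from Context to Model, mirroring the new llama_model_apply_lora_from_file in llama.h. A hedged sketch of the new call site; both paths are placeholders:

```ruby
# LoRA adapters are now applied to the Model before any Context is created.
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin',
                            params: LLaMACpp::ContextParams.new)
model.apply_lora_from_file(lora_path: '/path/to/lora.bin', n_threads: 4)
context = LLaMACpp::Context.new(model: model)
```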
@@ -138,15 +144,4 @@
   end
 
   class Params = ContextParams
-
-  class Client
-    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-                   ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
-                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
-    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
-                    ?frequency: Float, ?presence: Float,
-                    ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
-                    ?repeat_penalty: Float) -> String
-    def embeddings(String) -> Array[Float]
-  end
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-23 00:00:00.000000000 Z
+date: 2023-06-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -44,7 +44,6 @@ files:
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
 - lib/llama_cpp.rb
-- lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
data/lib/llama_cpp/client.rb DELETED
@@ -1,172 +0,0 @@
-# frozen_string_literal: true
-
-module LLaMACpp
-  # Client provides a high-level interface to the LLM model.
-  class Client # rubocop:disable Metrics/ClassLength
-    # Creates a new client.
-    #
-    # @param model_path [String] The path to the model file.
-    # @param lora_adapter_path [String] The path to the LoRA adapter file.
-    # @param lora_base_path [String] The path to the LoRA base model file.
-    # @param n_ctx [Integer] The context size.
-    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
-    # @param use_mmap [Boolean] The flag whether to use mmap.
-    # @param use_mlock [Boolean] The flag hether to use mlock.
-    # @param embedding [Boolean] The flag whether to calculate embedding.
-    # @param n_threads [Integer] The number of threads to use.
-    # @param seed [Integer] The seed for the random number generator.
-    # @return [Client]
-    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
-    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-                   n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
-                   embedding: false,
-                   n_threads: 1, seed: 0)
-      @params = {
-        model_path: model_path,
-        lora_adapter_path: lora_adapter_path,
-        lora_base_path: lora_base_path,
-        n_ctx: n_ctx,
-        memory_f16: memory_f16,
-        use_mmap: use_mmap,
-        use_mlock: use_mlock,
-        embedding: embedding,
-        n_threads: n_threads,
-        seed: seed
-      }
-      @context_params = ContextParams.new
-      @context_params.n_ctx = n_ctx
-      @context_params.n_parts = n_parts
-      @context_params.f16_kv = memory_f16
-      @context_params.use_mmap = use_mmap
-      @context_params.use_mlock = use_mlock
-      @context_params.embedding = embedding
-      @context_params.seed = seed
-      @context = Context.new(model_path: model_path, params: @context_params)
-      return unless lora_adapter_path.is_a?(String)
-
-      if lora_base_path.is_a?(String)
-        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
-      else
-        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
-      end
-    end
-    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
-
-    # Generates completions for a given prompt.
-    #
-    # @param prompt [String] The prompt to generate completions for.
-    # @param max_tokens [Integer] The maximum number of tokens to generate.
-    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
-    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
-    # @param n_batch [Integer] The batch size.
-    # @param frequency [Float] The frequency penalty value.
-    # @param presence [Float] The presence penalty value.
-    # @param top_k [Integer] The top-k value.
-    # @param top_p [Float] The top-p value.
-    # @param tfs_z [Float] The tail free sampling parameter.
-    # @param typical_p [Float] The typical probability value.
-    # @param temperature [Float] The temperature value.
-    # @param repeat_penalty [Float] The repeat penalty value.
-    # @return [String]
-    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
-                    frequency: 0.0, presence: 0.0,
-                    top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
-      embd_input = tokenize_prompt(prompt)
-
-      n_ctx = @context.n_ctx
-      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-      last_n_tokens = [0] * n_ctx
-
-      embd = []
-      n_consumed = 0
-      n_past = 0
-      n_remain = max_tokens
-      n_vocab = @context.n_vocab
-      output = []
-
-      while n_remain != 0
-        unless embd.empty?
-          if n_past + embd.size > n_ctx
-            n_left = n_past - n_keep
-            n_past = n_keep
-            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-          end
-
-          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
-        end
-
-        n_past += embd.size
-        embd.clear
-
-        if embd_input.size <= n_consumed
-          logits = @context.logits
-          base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-          candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-          # apply penalties
-          last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-          @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-          @context.sample_frequency_and_presence_penalties(
-            candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
-          )
-
-          # temperature sampling
-          @context.sample_top_k(candidates, k: top_k)
-          @context.sample_tail_free(candidates, z: tfs_z)
-          @context.sample_typical(candidates, prob: typical_p)
-          @context.sample_top_p(candidates, prob: top_p)
-          @context.sample_temperature(candidates, temperature: temperature)
-          id = @context.sample_token(candidates)
-
-          last_n_tokens.shift
-          last_n_tokens.push(id)
-
-          last_n_tokens.shift
-          last_n_tokens.push(id)
-
-          embd.push(id)
-          n_remain -= 1
-        else
-          while embd_input.size > n_consumed
-            embd.push(embd_input[n_consumed])
-            last_n_tokens.shift
-            last_n_tokens.push(embd_input[n_consumed])
-            n_consumed += 1
-            break if embd.size >= n_batch
-          end
-        end
-
-        embd.each { |token| output << @context.token_to_str(token) }
-
-        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
-      end
-
-      output.join.delete_prefix(" #{prompt}").strip
-    end
-    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-
-    # def chat(prompt); end
-
-    # Obtains the embedding for a given text.
-    #
-    # @param text [String] The text to obtain the embedding for.
-    # @return [Array<Float>]
-    def embeddings(text)
-      raise 'The embedding option is set to false' unless @params[:embedding]
-
-      embd_input = tokenize_prompt(text)
-      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
-
-      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
-      @context.embeddings
-    end
-
-    private
-
-    def tokenize_prompt(prompt)
-      @context.tokenize(text: " #{prompt}", add_bos: true)
-    end
-  end
-end
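The high-level Client class deleted above has no direct replacement in 0.3.0. Prompt completion can be handled with LLaMACpp.generate as sketched after the RBS changes; for embeddings, a rough sketch using the remaining low-level API, assuming ContextParams#embedding= still behaves as it did in the removed code and with placeholder paths:

```ruby
# Rough stand-in for the removed Client#embeddings.
params = LLaMACpp::ContextParams.new
params.embedding = true
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params) # placeholder path
context = LLaMACpp::Context.new(model: model)

tokens = context.tokenize(text: ' Hello, world.', add_bos: true)
context.eval(tokens: tokens, n_past: 0, n_threads: 4)
vector = context.embeddings
```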