llama_cpp 0.0.5 → 0.0.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/ext/llama_cpp/extconf.rb +24 -1
- data/ext/llama_cpp/llama_cpp.cpp +72 -0
- data/ext/llama_cpp/src/ggml-cuda.h +44 -0
- data/ext/llama_cpp/src/ggml-opencl.c +216 -0
- data/ext/llama_cpp/src/ggml-opencl.h +24 -0
- data/ext/llama_cpp/src/ggml.c +2324 -969
- data/ext/llama_cpp/src/ggml.h +656 -619
- data/ext/llama_cpp/src/llama.cpp +269 -42
- data/ext/llama_cpp/src/llama.h +22 -14
- data/ext/llama_cpp/src/llama_util.h +15 -3
- data/lib/llama_cpp/client.rb +151 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +16 -8
- data/sig/llama_cpp.rbs +26 -2
- metadata +6 -2
```diff
data/lib/llama_cpp/client.rb ADDED
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module LLaMACpp
+  # Client provides a high-level interface to the LLM model.
+  class Client
+    # Creates a new client.
+    #
+    # @param model_path [String] The path to the model file.
+    # @param lora_adapter_path [String] The path to the LoRA adapter file.
+    # @param lora_base_path [String] The path to the LoRA base model file.
+    # @param n_ctx [Integer] The context size.
+    # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+    # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+    # @param use_mmap [Boolean] The flag whether to use mmap.
+    # @param use_mlock [Boolean] The flag whether to use mlock.
+    # @param embedding [Boolean] The flag whether to calculate embedding.
+    # @param n_threads [Integer] The number of threads to use.
+    # @param seed [Integer] The seed for the random number generator.
+    # @return [Client]
+    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   embedding: false,
+                   n_threads: 1, seed: 0)
+      @params = {
+        model_path: model_path,
+        lora_adapter_path: lora_adapter_path,
+        lora_base_path: lora_base_path,
+        n_ctx: n_ctx,
+        n_parts: n_parts,
+        memory_f16: memory_f16,
+        use_mmap: use_mmap,
+        use_mlock: use_mlock,
+        embedding: embedding,
+        n_threads: n_threads,
+        seed: seed
+      }
+      @context_params = ContextParams.new
+      @context_params.n_ctx = n_ctx
+      @context_params.n_parts = n_parts
+      @context_params.f16_kv = memory_f16
+      @context_params.use_mmap = use_mmap
+      @context_params.use_mlock = use_mlock
+      @context_params.embedding = embedding
+      @context_params.seed = seed
+      @context = Context.new(model_path: model_path, params: @context_params)
+      return unless lora_adapter_path.is_a?(String)
+
+      if lora_base_path.is_a?(String)
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+      else
+        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+      end
+    end
+    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+    # Generates completions for a given prompt.
+    #
+    # @param prompt [String] The prompt to generate completions for.
+    # @param max_tokens [Integer] The maximum number of tokens to generate.
+    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+    # @param n_batch [Integer] The batch size.
+    # @param top_k [Integer] The top-k value.
+    # @param top_p [Float] The top-p value.
+    # @param temperature [Float] The temperature value.
+    # @param repeat_penalty [Float] The repeat penalty value.
+    # @return [String]
+    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+                    top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+      embd_input = tokenize_prompt(prompt)
+
+      n_ctx = @context.n_ctx
+      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+      last_n_tokens = [0] * n_ctx
+
+      embd = []
+      n_consumed = 0
+      n_past = 0
+      n_remain = max_tokens
+      output = []
+
+      while n_remain != 0
+        unless embd.empty?
+          if n_past + embd.size > n_ctx
+            n_left = n_past - n_keep
+            n_past = n_keep
+            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+          end
+
+          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+        end
+
+        n_past += embd.size
+        embd.clear
+
+        if embd_input.size <= n_consumed
+          start = n_ctx - repeat_last_n
+          id = @context.sample_top_p_top_k(
+            last_n_tokens[start...(start + repeat_last_n)],
+            top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+          )
+          last_n_tokens.shift
+          last_n_tokens.push(id)
+
+          embd.push(id)
+          n_remain -= 1
+        else
+          while embd_input.size > n_consumed
+            embd.push(embd_input[n_consumed])
+            last_n_tokens.shift
+            last_n_tokens.push(embd_input[n_consumed])
+            n_consumed += 1
+            break if embd.size >= n_batch
+          end
+        end
+
+        embd.each { |token| output << @context.token_to_str(token) }
+
+        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      end
+
+      output.join.delete_prefix(" #{prompt}").strip
+    end
+    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+    # def chat(prompt); end
+
+    # Obtains the embedding for a given text.
+    #
+    # @param text [String] The text to obtain the embedding for.
+    # @return [Array<Float>]
+    def embeddings(text)
+      raise 'The embedding option is set to false' unless @params[:embedding]
+
+      embd_input = tokenize_prompt(text)
+      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+      @context.embeddings
+    end
+
+    private
+
+    def tokenize_prompt(prompt)
+      @context.tokenize(text: " #{prompt}", add_bos: true)
+    end
+  end
+end
```
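To make the new high-level API concrete, here is a minimal usage sketch of `LLaMACpp::Client` based on the code added above. The model path is a placeholder, and the keyword arguments simply echo the defaults from the signature:

```ruby
require 'llama_cpp'

# Placeholder path; point this at a GGML-format LLaMA model file.
client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                              n_threads: 4, seed: 42)

# Tokenizes the prompt, evaluates it, and samples with top-k/top-p until
# max_tokens tokens are generated or an EOS token appears.
puts client.completions('Hello, my name is', max_tokens: 32)

# Embeddings are only available when the client is built with embedding: true.
embedder = LLaMACpp::Client.new(model_path: '/path/to/ggml-model-q4_0.bin',
                                embedding: true, n_threads: 4)
vector = embedder.embeddings('Hello, world.') # => Array of Floats
```

Note that `completions` prepends a space to the prompt during tokenization and strips it back off (`delete_prefix(" #{prompt}")`) before returning the generated text.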
```diff
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.5'
+  VERSION = '0.0.7'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…'
+  LLAMA_CPP_VERSION = 'master-11d9023'
end
```
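As a quick sanity check after upgrading, both version constants can be read directly; the expected values come from the diff above:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.0.7"
puts LLaMACpp::LLAMA_CPP_VERSION # => "master-11d9023"
```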
```diff
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
+require_relative 'llama_cpp/client'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -12,24 +13,31 @@ module LLaMACpp

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context]
-  # @param prompt [String]
-  # @param …
+  # @param context [LLaMACpp::Context] The context to use.
+  # @param prompt [String] The prompt to start generation with.
+  # @param n_predict [Integer] The number of tokens to predict.
+  # @param n_threads [Integer] The number of threads.
   # @return [String]
-  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-
+  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+    raise ArgumentError, 'context must have loaded the model' if context.empty?
+    raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+    spaced_prompt = " #{prompt}"
     embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

     n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
     last_n_tokens = [0] * n_ctx

     embd = []
     n_consumed = 0
     n_keep = 10
     n_past = 0
-    n_remain = …
+    n_remain = n_predict
     repeat_last_n = 64
+    n_batch = 512
     output = []

     while n_remain != 0
@@ -62,13 +70,13 @@ module LLaMACpp
           last_n_tokens.shift
           last_n_tokens.push(embd_input[n_consumed])
           n_consumed += 1
-          break if embd.size >= …
+          break if embd.size >= n_batch
         end
       end

       embd.each { |token| output << context.token_to_str(token) }

-      break if embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
     end

     output.join.delete_prefix(spaced_prompt).strip
```
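The updated `LLaMACpp.generate` now validates its inputs and exposes `n_predict` as a keyword instead of a hard-coded count. A minimal sketch of calling it directly, with a placeholder model path:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 123
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)

# n_predict was previously fixed inside the method; it is now configurable.
puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)
```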
```diff
data/sig/llama_cpp.rbs CHANGED
@@ -5,7 +5,19 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String

-  …
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
+  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -18,7 +30,8 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def …
+    def empty?: () -> bool
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -50,9 +63,20 @@ module LLaMACpp
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def use_mmap: () -> bool
+    def use_mmap=: (bool) -> bool
     def vocab_only: () -> bool
     def vocab_only=: (bool) -> bool
   end

   class Params = ContextParams
+
+  class Client
+    def initialize: (model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+                     ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                     ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+    def completions: (String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+                      ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+    def embeddings: (String) -> Array[Float]
+  end
 end
```
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-04-
|
11
|
+
date: 2023-04-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -26,12 +26,16 @@ files:
|
|
26
26
|
- ext/llama_cpp/llama_cpp.cpp
|
27
27
|
- ext/llama_cpp/llama_cpp.h
|
28
28
|
- ext/llama_cpp/src/LICENSE
|
29
|
+
- ext/llama_cpp/src/ggml-cuda.h
|
30
|
+
- ext/llama_cpp/src/ggml-opencl.c
|
31
|
+
- ext/llama_cpp/src/ggml-opencl.h
|
29
32
|
- ext/llama_cpp/src/ggml.c
|
30
33
|
- ext/llama_cpp/src/ggml.h
|
31
34
|
- ext/llama_cpp/src/llama.cpp
|
32
35
|
- ext/llama_cpp/src/llama.h
|
33
36
|
- ext/llama_cpp/src/llama_util.h
|
34
37
|
- lib/llama_cpp.rb
|
38
|
+
- lib/llama_cpp/client.rb
|
35
39
|
- lib/llama_cpp/version.rb
|
36
40
|
- sig/llama_cpp.rbs
|
37
41
|
homepage: https://github.com/yoshoku/llama_cpp.rb
|