llama_cpp 0.11.0 → 0.12.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +96 -0
- data/ext/llama_cpp/llama_cpp.cpp +51 -0
- data/lib/llama_cpp/version.rb +1 -1
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 87010edca1b352ae7bdd3a693451893b13dd75e9e109f9e2b42f6164cc186b08
+  data.tar.gz: ff34254b6377698903dcf771663b91c3c804111228888d96e91363bd0f29d3a6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a23aa59fa4936940b28942398bfe98bdb09574162943ebaff31cdbda19394c7690f6c780f49da31eecc4b77427718a8b7ee58e62b2adb087100e1eee66310abc
+  data.tar.gz: 5cc105e69fc81d4616d93cd036af70f809be0c99b9155a6d3e386c9900ca012123353c23417ce56a5a64a1d805108b35de2d9feb5a6265c110d9341e5a2e242b
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+- Add `get_one` singleton method to `Batch`.
+
+**Breaking Changes**
+
+- Add deprecation warning to the `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+- Change the `generate` method and example scripts to avoid using deprecated methods.
+
+## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08
+
+- Fix to set the values of the `@n_tokens` and `@has_evaluated` instance variables in the `decode` method of `Context`.
+- Add documentation for the `logits` method in `Context`.
+- Add an example script for simple text completion: examples/simple.rb
+
 ## [[0.11.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.3...v0.11.0)] - 2024-01-07
 
 - Add `set_n_seq_id` and `get_n_seq_id` methods to `Batch`.
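For orientation, here is a minimal sketch of the `Batch.get_one` flow that 0.12.0 moves the examples and the `generate` method onto (the model path is a placeholder; the calls mirror the diffs below):

```ruby
require 'llama_cpp'

# Load a model and create a context with default parameters.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new) # placeholder path
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

tokens = context.model.tokenize(text: 'Hello my name is', add_bos: true)

# Wrap the token array in a Batch and evaluate it; this replaces the now-deprecated Context#eval.
context.decode(LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0))
```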
data/README.md
CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executing
 $ gem install llama_cpp
 ```
 
-There are several installation options
+There are several installation options:
 
 ```sh
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by the with_config method.
data/examples/chat.rb
CHANGED
@@ -9,6 +9,7 @@
 require 'llama_cpp'
 require 'thor'
 require 'readline'
+require 'etc'
 
 class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
   option :temp, type: :numeric, desc: 'temperature', default: 0.8
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     mdl_params = LLaMACpp::ModelParams.new
     mdl_params.n_gpu_layers = options[:n_gpu_layers]
     model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
     ctx_params = LLaMACpp::ContextParams.new
     ctx_params.seed = options[:seed] if options[:seed] != -1
+    ctx_params.n_threads = options[:n_threads]
+    ctx_params.n_threads_batch = options[:n_threads]
     context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
     antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
         0.step(embd.size - 1, options[:batch_size]) do |i|
           n_eval = [options[:batch_size], embd.size - i].min
-          context.
+          context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
           n_past += n_eval
         end
       end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
         context.sample_tail_free(candidates, z: options[:tfs_z])
         context.sample_typical(candidates, prob: options[:typical_p])
         context.sample_top_p(candidates, prob: options[:top_p])
-        context.
+        context.sample_temp(candidates, temp: options[:temp])
         id = context.sample_token(candidates)
 
         last_n_tokens.shift
data/examples/embedding.rb
CHANGED
@@ -7,6 +7,7 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Embedding < Thor # rubocop:disable Style/Documentation
   default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
   option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
   def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
     mdl_params = LLaMACpp::ModelParams.new
     mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
     ctx_params = LLaMACpp::ContextParams.new
     ctx_params.embedding = true
     ctx_params.seed = options[:seed] if options[:seed] != -1
+    ctx_params.n_threads = options[:n_threads]
+    ctx_params.n_threads_batch = options[:n_threads]
     context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
     embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
     return unless embd_input.size.positive?
 
-    context.
+    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
 
     context.embeddings.each { |val| print("#{val} ") }
     print("\n")
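Stripped of option handling, the embedding flow above reduces to a few calls; a minimal sketch, assuming a local GGUF model at a placeholder path:

```ruby
require 'llama_cpp'

# embedding = true makes the context expose an embedding vector instead of sampling logits.
params = LLaMACpp::ContextParams.new
params.embedding = true
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new) # placeholder path
context = LLaMACpp::Context.new(model: model, params: params)

tokens = context.model.tokenize(text: 'Hello world', add_bos: true)
context.decode(LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0))
vector = context.embeddings # Array of Float, one value per embedding dimension
```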
data/examples/simple.rb
ADDED
@@ -0,0 +1,96 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# simple.rb is a simple text completion script.
+# It is created with reference to simple.cpp in llama.cpp examples:
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/simple/simple.cpp
+
+require 'llama_cpp'
+require 'thor'
+require 'etc'
+
+class Simple < Thor # rubocop:disable Style/Documentation
+  default_command :main
+  desc 'main', 'Simple completion'
+  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+  option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
+  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+    n_len = 32
+    model_params = LLaMACpp::ModelParams.new
+    model = LLaMACpp::Model.new(model_path: options[:model], params: model_params)
+    context_params = LLaMACpp::ContextParams.new
+    context_params.seed = 1234
+    context_params.n_ctx = 2048
+    context_params.logits_all = true
+    context_params.n_threads = options[:n_threads]
+    context_params.n_threads_batch = options[:n_threads]
+    context = LLaMACpp::Context.new(model: model, params: context_params)
+
+    tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
+    n_ctx = context.n_ctx
+    n_kv_req = tokens_list.size + (n_len - tokens_list.size)
+    raise 'n_kv_req > n_ctx, the required KV cache size is not big enough' if n_kv_req > n_ctx
+
+    print("\nmain: n_len = #{n_len}, n_ctx = #{n_ctx}, n_kv_req = #{n_kv_req}\n\n")
+
+    tokens_list.each { |token| print(context.model.token_to_piece(token)) }
+
+    batch = LLaMACpp::Batch.new(max_n_token: 512, n_embd: 0, max_n_seq: 1)
+    tokens_list.each_with_index do |token, id|
+      batch.set_token(batch.n_tokens, token)
+      batch.set_pos(batch.n_tokens, id)
+      batch.set_n_seq_id(batch.n_tokens, 1)
+      batch.set_seq_id(batch.n_tokens, 0, 0)
+      batch.set_logits(batch.n_tokens, false)
+      batch.n_tokens = batch.n_tokens + 1
+    end
+
+    batch.set_logits(batch.n_tokens - 1, true)
+
+    context.decode(batch)
+
+    n_cur = batch.n_tokens
+    n_decode = 0
+    n_vocab = context.model.n_vocab
+
+    t_start = Time.now
+
+    while n_cur <= n_len
+      logits = context.logits[((batch.n_tokens - 1) * n_vocab)..]
+
+      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i] || 0.0, p: 0.0) }
+      candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+      new_token_id = context.sample_token_greedy(candidates)
+
+      if new_token_id == context.model.token_eos || n_cur == n_len
+        print("\n")
+        break
+      end
+
+      print(context.model.token_to_piece(new_token_id))
+
+      batch.n_tokens = 0
+
+      batch.set_token(batch.n_tokens, new_token_id)
+      batch.set_pos(batch.n_tokens, n_cur)
+      batch.set_n_seq_id(batch.n_tokens, 1)
+      batch.set_seq_id(batch.n_tokens, 0, 0)
+      batch.set_logits(batch.n_tokens, true)
+      batch.n_tokens = batch.n_tokens + 1
+
+      n_decode += 1
+      n_cur += 1
+      context.decode(batch)
+    end
+
+    t_end = Time.now
+
+    print("\nmain: decoded #{n_decode} tokens in #{(t_end - t_start).floor(2)} s, speed: #{n_decode.fdiv(t_end - t_start).floor(2)} t/s\n\n")
+
+    LLaMACpp.backend_free
+  end
+end
+
+Simple.start(ARGV)
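With a local GGUF model, the script can be run directly, e.g. `ruby examples/simple.rb -m path/to/model.gguf -p 'Hello my name is'` (the model path is a placeholder). It prints the prompt, greedily samples a continuation up to a total of 32 token positions (`n_len`), and reports the decoding speed in tokens per second.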
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -51,6 +51,7 @@ public:
   static void define_class(VALUE outer) {
     rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
     rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+    rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
     rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
     rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
     rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
 private:
   static const rb_data_type_t llama_batch_type;
 
+  static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "pos_zero must be an integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "seq_id must be an integer");
+      return Qnil;
+    }
+
+    const size_t sz_array = RARRAY_LEN(kw_values[0]);
+    const int32_t n_tokens = NUM2INT(kw_values[1]);
+    const llama_pos pos_zero = NUM2INT(kw_values[2]);
+    const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+    LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+    new (ptr) LLaMABatchWrapper();
+    ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+    ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+    for (size_t i = 0; i < sz_array; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      ptr->batch.token[i] = NUM2INT(el);
+    }
+
+    return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+  }
+
   static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -2054,6 +2097,8 @@ private:
     rb_scan_args(argc, argv, ":", &kw_args);
     rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+    rb_warn("eval is deprecated. Use decode instead.");
+
     if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
       rb_raise(rb_eArgError, "tokens must be an Array");
       return Qnil;
@@ -2104,6 +2149,8 @@ private:
     rb_scan_args(argc, argv, ":", &kw_args);
     rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+    rb_warn("eval_embd is deprecated. Use decode instead.");
+
     if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
       rb_raise(rb_eArgError, "tokens must be an Array");
       return Qnil;
@@ -2162,6 +2209,8 @@ private:
       rb_raise(rb_eRuntimeError, "Failed to decode");
      return Qnil;
     }
+    rb_iv_set(self, "@n_tokens", INT2NUM(batch_ptr->batch.n_tokens));
+    rb_iv_set(self, "@has_evaluated", Qtrue);
     return Qnil;
   }
 
@@ -2792,6 +2841,8 @@ private:
     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
     rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
 
+    rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
     if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
      rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
      return Qnil;
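Each deprecated entry point now calls `rb_warn` before doing its work, so existing code keeps running but prints a notice to stderr on every call (subject to Ruby's warning settings). A rough migration sketch (the keyword name for the deprecated sampler is an assumption based on earlier releases):

```ruby
# Before: warns "sample_temperature is deprecated. Use sample_temp instead."
context.sample_temperature(candidates, temperature: 0.8) # assumed keyword name
# After, as used by the updated examples and generate method:
context.sample_temp(candidates, temp: 0.8)
```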
data/lib/llama_cpp/version.rb
CHANGED
data/lib/llama_cpp.rb
CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
         embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
       end
 
-      context.
+      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
     end
 
     n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
     context.sample_tail_free(candidates, z: tfs_z)
     context.sample_typical(candidates, prob: typical_p)
     context.sample_top_p(candidates, prob: top_p)
-    context.
+    context.sample_temp(candidates, temp: temperature)
     id = context.sample_token(candidates)
 
     last_n_tokens.shift
data/sig/llama_cpp.rbs
CHANGED
@@ -149,6 +149,7 @@ module LLaMACpp
   class Batch
     public
 
+    def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
     def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
     def n_tokens=: (Integer) -> Integer
     def n_tokens: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.12.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-
+date: 2024-01-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - examples/chat.rb
 - examples/embedding.rb
 - examples/prompt_jp.txt
+- examples/simple.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h