llama_cpp 0.11.1 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 705f8a7e2228a324f14fa23ca093e2ce4408eacb839d891284c40e319b07940a
- data.tar.gz: fc04d232f2c7ecfa1402aa711eda63e36a03d287a6bc7c8e0d2c791194ad9e9a
+ metadata.gz: 87010edca1b352ae7bdd3a693451893b13dd75e9e109f9e2b42f6164cc186b08
+ data.tar.gz: ff34254b6377698903dcf771663b91c3c804111228888d96e91363bd0f29d3a6
  SHA512:
- metadata.gz: 888ba5bfa23ab51746d49c2cc071f8a220d3de39f6c3a34576f35bcb993fc0be841481dcbca9762504397ca6555571f43b4ba2c0ae3dae3fcd8d29bd2735ae16
- data.tar.gz: e8940f8ab7a542569a71ad5c869ac781b8ad958ca02d2a5547b792008c228ae1e9ff23cc5b9552e83b0c631805ec7edfea6138c5ae9d922daca06e5ab3f1490d
+ metadata.gz: a23aa59fa4936940b28942398bfe98bdb09574162943ebaff31cdbda19394c7690f6c780f49da31eecc4b77427718a8b7ee58e62b2adb087100e1eee66310abc
+ data.tar.gz: 5cc105e69fc81d4616d93cd036af70f809be0c99b9155a6d3e386c9900ca012123353c23417ce56a5a64a1d805108b35de2d9feb5a6265c110d9341e5a2e242b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+ - Add `get_one` singleton method to `Batch`.
+
+ **Breaking Changes**
+
+ - Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+ - Change to avoid using deprecated methods on `generate` method and example scripts.
+
  ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08

  - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
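The breaking changes above amount to replacing the deprecated `Context#eval`, `Context#eval_embd`, and `Context#sample_temperature` calls. A minimal migration sketch, using only API names that appear later in this diff; the model path and prompt are placeholders:

```ruby
require 'llama_cpp'

# Placeholder model path, for illustration only.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)
tokens = context.model.tokenize(text: 'Hello', add_bos: true)

# 0.11.x style (now prints a deprecation warning):
#   context.eval(tokens: tokens, n_past: 0)
# 0.12.0 style: build a batch with the new Batch.get_one singleton method and decode it.
context.decode(LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0))

# Likewise, sample_temperature(candidates, temperature: t) becomes sample_temp(candidates, temp: t).
```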
data/README.md CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
  $ gem install llama_cpp
  ```

- There are several installation options for improving execution performance:
+ There are several installation options:

  ```sh
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas

- # use Metal on macOS
- $ gem install llama_cpp -- --with-metal
+ # use CUDA
+ $ gem install llama_cpp -- --with-cuda
  ```

  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/examples/chat.rb CHANGED
@@ -9,6 +9,7 @@
  require 'llama_cpp'
  require 'thor'
  require 'readline'
+ require 'etc'

  class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
  option :temp, type: :numeric, desc: 'temperature', default: 0.8
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  mdl_params = LLaMACpp::ModelParams.new
  mdl_params.n_gpu_layers = options[:n_gpu_layers]
  model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
  ctx_params = LLaMACpp::ContextParams.new
  ctx_params.seed = options[:seed] if options[:seed] != -1
+ ctx_params.n_threads = options[:n_threads]
+ ctx_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: ctx_params)

  antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  0.step(embd.size - 1, options[:batch_size]) do |i|
  n_eval = [options[:batch_size], embd.size - i].min
- context.eval(tokens: embd[i...i + n_eval], n_past: n_past)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
  n_past += n_eval
  end
  end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  context.sample_tail_free(candidates, z: options[:tfs_z])
  context.sample_typical(candidates, prob: options[:typical_p])
  context.sample_top_p(candidates, prob: options[:top_p])
- context.sample_temperature(candidates, temperature: options[:temp])
+ context.sample_temp(candidates, temp: options[:temp])
  id = context.sample_token(candidates)

  last_n_tokens.shift
data/examples/embedding.rb CHANGED
@@ -7,6 +7,7 @@

  require 'llama_cpp'
  require 'thor'
+ require 'etc'

  class Embedding < Thor # rubocop:disable Style/Documentation
  default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  mdl_params = LLaMACpp::ModelParams.new
  mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  ctx_params = LLaMACpp::ContextParams.new
  ctx_params.embedding = true
  ctx_params.seed = options[:seed] if options[:seed] != -1
+ ctx_params.n_threads = options[:n_threads]
+ ctx_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: ctx_params)

  embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)

  return unless embd_input.size.positive?

- context.eval(tokens: embd_input, n_past: 0)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))

  context.embeddings.each { |val| print("#{val} ") }
  print("\n")
data/examples/simple.rb CHANGED
@@ -7,12 +7,14 @@

  require 'llama_cpp'
  require 'thor'
+ require 'etc'

  class Simple < Thor # rubocop:disable Style/Documentation
  default_command :main
  desc 'main', 'Simple completion'
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+ option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  n_len = 32
  model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
  context_params.seed = 1234
  context_params.n_ctx = 2048
  context_params.logits_all = true
- context_params.n_threads = 4
+ context_params.n_threads = options[:n_threads]
+ context_params.n_threads_batch = options[:n_threads]
  context = LLaMACpp::Context.new(model: model, params: context_params)

  tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -51,6 +51,7 @@ public:
  static void define_class(VALUE outer) {
  rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
  rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+ rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
  rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
  rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
  rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
  private:
  static const rb_data_type_t llama_batch_type;

+ static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+ VALUE kw_args = Qnil;
+ ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+ if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+ rb_raise(rb_eArgError, "tokens must be an array");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+ rb_raise(rb_eArgError, "n_tokens must be an integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "pos_zero must be an integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+ rb_raise(rb_eArgError, "seq_id must be an integer");
+ return Qnil;
+ }
+
+ const size_t sz_array = RARRAY_LEN(kw_values[0]);
+ const int32_t n_tokens = NUM2INT(kw_values[1]);
+ const llama_pos pos_zero = NUM2INT(kw_values[2]);
+ const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+ LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+ new (ptr) LLaMABatchWrapper();
+ ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+ ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+ for (size_t i = 0; i < sz_array; i++) {
+ VALUE el = rb_ary_entry(kw_values[0], i);
+ ptr->batch.token[i] = NUM2INT(el);
+ }
+
+ return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+ }
+
  static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
  ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -2054,6 +2097,8 @@ private:
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);

+ rb_warn("eval is deprecated. Use decode instead.");
+
  if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
  rb_raise(rb_eArgError, "tokens must be an Array");
  return Qnil;
@@ -2104,6 +2149,8 @@ private:
  rb_scan_args(argc, argv, ":", &kw_args);
  rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);

+ rb_warn("eval_embd is deprecated. Use decode instead.");
+
  if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
  rb_raise(rb_eArgError, "tokens must be an Array");
  return Qnil;
@@ -2794,6 +2841,8 @@ private:
  rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
  rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);

+ rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
  if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
  rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
  return Qnil;
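From Ruby, the singleton method registered above takes the four required keyword arguments validated in `_llama_batch_get_one` and raises `ArgumentError` on a type mismatch. A small hedged sketch; the token ids are made up for illustration:

```ruby
require 'llama_cpp'

# Illustrative token ids only; real ids come from Model#tokenize.
batch = LLaMACpp::Batch.get_one(tokens: [1, 2, 3], n_tokens: 3, pos_zero: 0, seq_id: 0)
batch.n_tokens # expected to be 3

# A non-Array tokens value trips the RB_TYPE_P check in the binding:
#   LLaMACpp::Batch.get_one(tokens: 'oops', n_tokens: 1, pos_zero: 0, seq_id: 0)
#   #=> ArgumentError: tokens must be an array
```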
data/lib/llama_cpp/version.rb CHANGED
@@ -3,7 +3,7 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.11.1'
+ VERSION = '0.12.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
  LLAMA_CPP_VERSION = 'b1768'
data/lib/llama_cpp.rb CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
  embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
  end

- context.eval(tokens: embd, n_past: n_past)
+ context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
  end

  n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
  context.sample_tail_free(candidates, z: tfs_z)
  context.sample_typical(candidates, prob: typical_p)
  context.sample_top_p(candidates, prob: top_p)
- context.sample_temperature(candidates, temperature: temperature)
+ context.sample_temp(candidates, temp: temperature)
  id = context.sample_token(candidates)

  last_n_tokens.shift
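The module-level `generate` helper now drives this `decode`/`sample_temp` path internally. A hedged usage sketch; the diff does not show `generate`'s full signature, so the context-plus-prompt call shape and the model path below are assumptions:

```ruby
require 'llama_cpp'

# Assumed call shape; see the gem's README for the authoritative example.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)
puts LLaMACpp.generate(context, 'Hello my name is')
```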
data/sig/llama_cpp.rbs CHANGED
@@ -149,6 +149,7 @@ module LLaMACpp
  class Batch
  public

+ def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
  def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
  def n_tokens=: (Integer) -> Integer
  def n_tokens: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.11.1
+ version: 0.12.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-01-08 00:00:00.000000000 Z
+ date: 2024-01-11 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: