llama_cpp 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +4 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +1 -1
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 87010edca1b352ae7bdd3a693451893b13dd75e9e109f9e2b42f6164cc186b08
+  data.tar.gz: ff34254b6377698903dcf771663b91c3c804111228888d96e91363bd0f29d3a6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a23aa59fa4936940b28942398bfe98bdb09574162943ebaff31cdbda19394c7690f6c780f49da31eecc4b77427718a8b7ee58e62b2adb087100e1eee66310abc
+  data.tar.gz: 5cc105e69fc81d4616d93cd036af70f809be0c99b9155a6d3e386c9900ca012123353c23417ce56a5a64a1d805108b35de2d9feb5a6265c110d9341e5a2e242b
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+- Add `get_one` singleton method to `Batch`.
+
+**Breaking Changes**
+
+- Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+- Change to avoid using deprecated methods on `generate` method and example scripts.
+
 ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08
 
 - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
data/README.md
CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
 $ gem install llama_cpp
 ```
 
-There are several installation options
+There are several installation options:
 
 ```sh
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/examples/chat.rb
CHANGED
@@ -9,6 +9,7 @@
 require 'llama_cpp'
 require 'thor'
 require 'readline'
+require 'etc'
 
 class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
 option :temp, type: :numeric, desc: 'temperature', default: 0.8
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
 model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 0.step(embd.size - 1, options[:batch_size]) do |i|
 n_eval = [options[:batch_size], embd.size - i].min
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
 n_past += n_eval
 end
 end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 context.sample_tail_free(candidates, z: options[:tfs_z])
 context.sample_typical(candidates, prob: options[:typical_p])
 context.sample_top_p(candidates, prob: options[:top_p])
-context.
+context.sample_temp(candidates, temp: options[:temp])
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
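The two later hunks above replace the deprecated calls in the chat example's prompt-evaluation loop and sampling chain. In isolation, the chunked decoding pattern looks roughly like the helper below; this is a sketch, not the script's actual structure, and `batch_size` is a placeholder for its `--batch_size` option:

```ruby
require 'llama_cpp'

# Decode prompt tokens in fixed-size chunks; returns the number of positions consumed.
def decode_prompt(context, embd, batch_size)
  n_past = 0
  0.step(embd.size - 1, batch_size) do |i|
    n_eval = [batch_size, embd.size - i].min
    chunk = embd[i...(i + n_eval)]
    context.decode(LLaMACpp::Batch.get_one(tokens: chunk, n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
    n_past += n_eval # the next chunk starts right after the tokens decoded so far
  end
  n_past
end
```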
data/examples/embedding.rb
CHANGED
@@ -7,6 +7,7 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Embedding < Thor # rubocop:disable Style/Documentation
 default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.embedding = true
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
 return unless embd_input.size.positive?
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
 
 context.embeddings.each { |val| print("#{val} ") }
 print("\n")
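Putting the embedding example's changes together, the flow after 0.12.0 is tokenize, decode via `Batch.get_one`, then read `Context#embeddings`. A condensed sketch with a placeholder model path:

```ruby
require 'etc'
require 'llama_cpp'

ctx_params = LLaMACpp::ContextParams.new
ctx_params.embedding = true               # ask the context to produce embeddings
ctx_params.n_threads = Etc.nprocessors
ctx_params.n_threads_batch = Etc.nprocessors

# Placeholder model path.
model = LLaMACpp::Model.new(model_path: './model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: ctx_params)

tokens = context.model.tokenize(text: 'Hello world', add_bos: true)
context.decode(LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0))
p context.embeddings  # array of floats for the prompt
```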
data/examples/simple.rb
CHANGED
@@ -7,12 +7,14 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Simple < Thor # rubocop:disable Style/Documentation
 default_command :main
 desc 'main', 'Simple completion'
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 n_len = 32
 model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
 context_params.seed = 1234
 context_params.n_ctx = 2048
 context_params.logits_all = true
-context_params.n_threads =
+context_params.n_threads = options[:n_threads]
+context_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: context_params)
 
 tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -51,6 +51,7 @@ public:
 static void define_class(VALUE outer) {
 rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
 rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
 rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
 rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
 rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
 private:
 static const rb_data_type_t llama_batch_type;
 
+static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+VALUE kw_args = Qnil;
+ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+rb_scan_args(argc, argv, ":", &kw_args);
+rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+rb_raise(rb_eArgError, "tokens must be an array");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+rb_raise(rb_eArgError, "n_tokens must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+rb_raise(rb_eArgError, "pos_zero must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+rb_raise(rb_eArgError, "seq_id must be an integer");
+return Qnil;
+}
+
+const size_t sz_array = RARRAY_LEN(kw_values[0]);
+const int32_t n_tokens = NUM2INT(kw_values[1]);
+const llama_pos pos_zero = NUM2INT(kw_values[2]);
+const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+new (ptr) LLaMABatchWrapper();
+ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+for (size_t i = 0; i < sz_array; i++) {
+VALUE el = rb_ary_entry(kw_values[0], i);
+ptr->batch.token[i] = NUM2INT(el);
+}
+
+return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+}
+
 static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
 ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -2054,6 +2097,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2104,6 +2149,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval_embd is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2794,6 +2841,8 @@ private:
 rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
 rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
 
+rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
 if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
 rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
 return Qnil;
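From the Ruby side, the `_llama_batch_get_one` binding added above requires all four keyword arguments (`rb_get_kwargs` is called with 4 required, 0 optional) and type-checks each before wrapping the batch. A small illustration of what that implies for callers:

```ruby
require 'llama_cpp'

# All four keywords are required by the binding.
batch = LLaMACpp::Batch.get_one(tokens: [1, 2, 3], n_tokens: 3, pos_zero: 0, seq_id: 0)
batch.n_tokens  # => 3

# A wrong type raises ArgumentError, e.g. when tokens is not an array:
#   ArgumentError: tokens must be an array
```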
data/lib/llama_cpp/version.rb
CHANGED
data/lib/llama_cpp.rb
CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
 embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
 end
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
 end
 
 n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
 context.sample_tail_free(candidates, z: tfs_z)
 context.sample_typical(candidates, prob: typical_p)
 context.sample_top_p(candidates, prob: top_p)
-context.
+context.sample_temp(candidates, temp: temperature)
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
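Because the deprecated calls are replaced inside the module-level `generate` helper itself, code that only calls `LLaMACpp.generate` should need no changes to avoid the deprecation warnings. A minimal sketch of such a call; the model path is a placeholder, the positional `(context, prompt)` layout is assumed from the gem's README, and the helper's optional keyword arguments are not shown in this diff:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: './model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# As of 0.12.0, generate drives decoding via Batch.get_one / Context#decode internally.
puts LLaMACpp.generate(context, 'Hello, World.')
```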
data/sig/llama_cpp.rbs
CHANGED
@@ -149,6 +149,7 @@ module LLaMACpp
 class Batch
 public
 
+def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
 def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
 def n_tokens=: (Integer) -> Integer
 def n_tokens: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.12.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-
+date: 2024-01-11 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: