llama_cpp 0.11.1 → 0.12.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +4 -1
- data/ext/llama_cpp/llama_cpp.cpp +63 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +5 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2

checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e

data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+- Bump bundled llama.cpp from b1768 to b1833.
+- Add model file type constants.
+- Add `kv_cache_seq_div` method to `Context`.
+
+## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+- Add `get_one` singleton method to `Batch`.
+
+**Breaking Changes**
+
+- Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+- Change to avoid using deprecated methods on `generate` method and example scripts.
+
 ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08
 
 - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
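
The 0.12.0 deprecations map directly onto replacement calls that appear in the example diffs below. The following is a minimal migration sketch, not taken from the gem's documentation: the model path is a placeholder, and the method names and keyword arguments are the ones shown in this changeset (`Batch.get_one`, `Context#decode`, `Context#sample_temp`).

```ruby
require 'llama_cpp'

# Placeholder model path; params are left at their defaults.
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

tokens = context.model.tokenize(text: 'Hello my name is', add_bos: true)

# 0.11.x code called context.eval / context.eval_embd; both now emit deprecation warnings.
# From 0.12.0 on, wrap the tokens in a Batch and hand it to decode, as the bundled examples do.
context.decode(LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0))

# Likewise, sample_temperature(candidates, temperature: t) is deprecated in favour of
# sample_temp(candidates, temp: t); see the examples/chat.rb hunk further down.
```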

data/README.md
CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
 $ gem install llama_cpp
 ```
 
-There are several installation options
+There are several installation options:
 
 ```sh
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
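
For reference, the `--with-*` flags reach the extension's build script through mkmf's `with_config`. The sketch below is illustrative only, not the gem's actual extconf.rb; the compiler define and library name are assumptions.

```ruby
# Illustrative extconf.rb-style sketch: how a flag passed as
#   gem install llama_cpp -- --with-openblas
# reaches the build script. Not the gem's real extconf.rb.
require 'mkmf'

if with_config('openblas')                 # true when --with-openblas was given
  $CFLAGS << ' -DGGML_USE_OPENBLAS'        # assumed compile-time switch
  abort 'libopenblas not found' unless have_library('openblas')
end

create_makefile('llama_cpp/llama_cpp')
```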

data/examples/chat.rb
CHANGED
@@ -9,6 +9,7 @@
 require 'llama_cpp'
 require 'thor'
 require 'readline'
+require 'etc'
 
 class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
   option :temp, type: :numeric, desc: 'temperature', default: 0.8
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     mdl_params = LLaMACpp::ModelParams.new
     mdl_params.n_gpu_layers = options[:n_gpu_layers]
     model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
     ctx_params = LLaMACpp::ContextParams.new
     ctx_params.seed = options[:seed] if options[:seed] != -1
+    ctx_params.n_threads = options[:n_threads]
+    ctx_params.n_threads_batch = options[:n_threads]
     context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
     antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
       0.step(embd.size - 1, options[:batch_size]) do |i|
         n_eval = [options[:batch_size], embd.size - i].min
-        context.
+        context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
         n_past += n_eval
       end
     end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       context.sample_tail_free(candidates, z: options[:tfs_z])
       context.sample_typical(candidates, prob: options[:typical_p])
       context.sample_top_p(candidates, prob: options[:top_p])
-      context.
+      context.sample_temp(candidates, temp: options[:temp])
       id = context.sample_token(candidates)
 
       last_n_tokens.shift

data/examples/embedding.rb
CHANGED
@@ -7,6 +7,7 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Embedding < Thor # rubocop:disable Style/Documentation
   default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
   option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
   def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
     mdl_params = LLaMACpp::ModelParams.new
     mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
     ctx_params = LLaMACpp::ContextParams.new
     ctx_params.embedding = true
     ctx_params.seed = options[:seed] if options[:seed] != -1
+    ctx_params.n_threads = options[:n_threads]
+    ctx_params.n_threads_batch = options[:n_threads]
     context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
     embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
     return unless embd_input.size.positive?
 
-    context.
+    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
 
     context.embeddings.each { |val| print("#{val} ") }
     print("\n")

data/examples/simple.rb
CHANGED
@@ -7,12 +7,14 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Simple < Thor # rubocop:disable Style/Documentation
   default_command :main
   desc 'main', 'Simple completion'
   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
   option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
   def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
     n_len = 32
     model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
     context_params.seed = 1234
     context_params.n_ctx = 2048
     context_params.logits_all = true
-    context_params.n_threads =
+    context_params.n_threads = options[:n_threads]
+    context_params.n_threads_batch = options[:n_threads]
     context = LLaMACpp::Context.new(model: model, params: context_params)
 
     tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)

data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -51,6 +51,7 @@ public:
   static void define_class(VALUE outer) {
     rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
    rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+    rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
     rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
     rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
     rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
 private:
   static const rb_data_type_t llama_batch_type;
 
+  static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+      rb_raise(rb_eArgError, "tokens must be an array");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_tokens must be an integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "pos_zero must be an integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "seq_id must be an integer");
+      return Qnil;
+    }
+
+    const size_t sz_array = RARRAY_LEN(kw_values[0]);
+    const int32_t n_tokens = NUM2INT(kw_values[1]);
+    const llama_pos pos_zero = NUM2INT(kw_values[2]);
+    const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+    LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+    new (ptr) LLaMABatchWrapper();
+    ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+    ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+    for (size_t i = 0; i < sz_array; i++) {
+      VALUE el = rb_ary_entry(kw_values[0], i);
+      ptr->batch.token[i] = NUM2INT(el);
+    }
+
+    return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+  }
+
   static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -1983,6 +2026,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
     rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+    rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
     rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
     rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
     rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2054,6 +2098,8 @@ private:
     rb_scan_args(argc, argv, ":", &kw_args);
     rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+    rb_warn("eval is deprecated. Use decode instead.");
+
     if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
       rb_raise(rb_eArgError, "tokens must be an Array");
       return Qnil;
@@ -2104,6 +2150,8 @@ private:
     rb_scan_args(argc, argv, ":", &kw_args);
     rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+    rb_warn("eval_embd is deprecated. Use decode instead.");
+
     if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
       rb_raise(rb_eArgError, "tokens must be an Array");
       return Qnil;
@@ -2331,6 +2379,16 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eArgError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+    return Qnil;
+  }
+
   static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
@@ -2794,6 +2852,8 @@ private:
     rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
     rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
 
+    rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
     if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
       rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
       return Qnil;
@@ -3160,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
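
The new `kv_cache_seq_div` binding forwards its four Integer arguments to `llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)`, which integer-divides the positions of a sequence's cached tokens in the range `[p0, p1)` by `d`. A hedged usage sketch, assuming `context` was built as in the examples above and that some tokens have already been decoded on sequence 0 (the concrete numbers here are arbitrary):

```ruby
# Compress the first 512 cached positions of sequence 0 by a factor of 2.
# Arguments are positional: (seq_id, p0, p1, d), mirroring llama_kv_cache_seq_div in llama.h.
context.kv_cache_seq_div(0, 0, 512, 2)
```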

data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.11.1'
+  VERSION = '0.12.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1768'
+  LLAMA_CPP_VERSION = 'b1833'
 end

data/lib/llama_cpp.rb
CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
         embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
       end
 
-      context.
+      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
     end
 
     n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
     context.sample_tail_free(candidates, z: tfs_z)
     context.sample_typical(candidates, prob: typical_p)
     context.sample_top_p(candidates, prob: top_p)
-    context.
+    context.sample_temp(candidates, temp: temperature)
     id = context.sample_token(candidates)
 
     last_n_tokens.shift

data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+  LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -149,6 +152,7 @@ module LLaMACpp
   class Batch
     public
 
+    def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
     def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
     def n_tokens=: (Integer) -> Integer
     def n_tokens: () -> Integer
@@ -192,6 +196,7 @@ module LLaMACpp
     def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
     def kv_cache_seq_keep: (Integer) -> void
     def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+    def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void

data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -90,7 +90,7 @@ extern "C" {
         void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
     ggml_backend_synchronize(backend);
 }
 
-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    if (!backend->iface.graph_compute(backend, cgraph)) {
+        return false;
+    }
 
     // TODO: optional sync
     ggml_backend_synchronize(backend);
+    return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     cplan.work_data = cpu_ctx->work_data;
 
     ggml_graph_compute(cgraph, &cplan);
+    return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
58
58
|
|
59
59
|
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
60
60
|
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
61
|
-
GGML_API
|
61
|
+
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
62
62
|
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
63
63
|
|
64
64
|
// tensor copy between different backends
|