llama_cpp 0.11.1 → 0.12.1
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +4 -1
- data/ext/llama_cpp/llama_cpp.cpp +63 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +5 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13381408318e71cc1fc55c40ee9be6e62ad9e3ad6a8ce39279bb8040614e9b3b
+  data.tar.gz: 6456734b18865a7811f08d0d9d599771f574f4b59bd5b54a964ece7428115907
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1014349771d7aa3c318027de11603e96d5482e4bd5b1bcf0fd4874040245daf44c4cfb801077a698846459a7619ca9e01e0afc3507fc7bd519e7ba68a000a15d
+  data.tar.gz: 1315ca8954397edb0db93347a10762e35f829377ef3dba0ea9cf6c67f986972ac8e75b46c410a3ceceefc0474f2abbe6f441e56a60e789ef1d2617fc15cfb29e
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.0...v0.12.1)] - 2024-01-13
+
+- Bump bundled llama.cpp from b1768 to b1833.
+- Add model file type constants.
+- Add `kv_cache_seq_div` method to `Context`.
+
+## [[0.12.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.1...v0.12.0)] - 2024-01-11
+
+- Add `get_one` singleton method to `Batch`.
+
+**Breaking Changes**
+
+- Add deprecation warning to `eval`, `eval_embd`, and `sample_temperature` methods on `Context`.
+- Change to avoid using deprecated methods on `generate` method and example scripts.
+
 ## [[0.11.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.11.0...v0.11.1)] - 2024-01-08
 
 - Fix to set the values of `@n_tokens` and `@has_evaluated` instance variables in `decode` method of `Context`.
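As a quick orientation for the 0.12.x entries above: prompt evaluation now goes through `Batch.get_one` plus `Context#decode` instead of the deprecated `Context#eval`, and `sample_temp` replaces `sample_temperature`. A minimal sketch of the new pattern, assuming a valid GGUF model; the model path and prompt below are placeholders, not part of this release:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: './model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# Tokenize the prompt and feed it as a single-sequence batch (0.12.0+ style).
tokens = context.model.tokenize(text: 'Hello my name is', add_bos: true)
batch = LLaMACpp::Batch.get_one(tokens: tokens, n_tokens: tokens.size, pos_zero: 0, seq_id: 0)
context.decode(batch)
```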
data/README.md
CHANGED
@@ -22,14 +22,14 @@ If bundler is not being used to manage dependencies, install the gem by executin
 $ gem install llama_cpp
 ```
 
-There are several installation options
+There are several installation options:
 
 ```sh
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/examples/chat.rb
CHANGED
@@ -9,6 +9,7 @@
 require 'llama_cpp'
 require 'thor'
 require 'readline'
+require 'etc'
 
 class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 default_command :main
@@ -30,12 +31,15 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
 option :temp, type: :numeric, desc: 'temperature', default: 0.8
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
 model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 antiprompt = options[:reverse_prompt] || 'User:'
@@ -70,7 +74,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 0.step(embd.size - 1, options[:batch_size]) do |i|
 n_eval = [options[:batch_size], embd.size - i].min
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
 n_past += n_eval
 end
 end
@@ -95,7 +99,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 context.sample_tail_free(candidates, z: options[:tfs_z])
 context.sample_typical(candidates, prob: options[:typical_p])
 context.sample_top_p(candidates, prob: options[:top_p])
-context.
+context.sample_temp(candidates, temp: options[:temp])
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
data/examples/embedding.rb
CHANGED
@@ -7,6 +7,7 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Embedding < Thor # rubocop:disable Style/Documentation
 default_command :main
@@ -15,6 +16,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 mdl_params = LLaMACpp::ModelParams.new
 mdl_params.n_gpu_layers = options[:n_gpu_layers]
@@ -22,13 +24,15 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 ctx_params = LLaMACpp::ContextParams.new
 ctx_params.embedding = true
 ctx_params.seed = options[:seed] if options[:seed] != -1
+ctx_params.n_threads = options[:n_threads]
+ctx_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
 return unless embd_input.size.positive?
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
 
 context.embeddings.each { |val| print("#{val} ") }
 print("\n")
data/examples/simple.rb
CHANGED
@@ -7,12 +7,14 @@
 
 require 'llama_cpp'
 require 'thor'
+require 'etc'
 
 class Simple < Thor # rubocop:disable Style/Documentation
 default_command :main
 desc 'main', 'Simple completion'
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
+option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
 n_len = 32
 model_params = LLaMACpp::ModelParams.new
@@ -21,7 +23,8 @@ class Simple < Thor # rubocop:disable Style/Documentation
 context_params.seed = 1234
 context_params.n_ctx = 2048
 context_params.logits_all = true
-context_params.n_threads =
+context_params.n_threads = options[:n_threads]
+context_params.n_threads_batch = options[:n_threads]
 context = LLaMACpp::Context.new(model: model, params: context_params)
 
 tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
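All three example scripts now share the same thread-count setup: an `n_threads` Thor option defaulting to `Etc.nprocessors`, applied to both `n_threads` and `n_threads_batch` on the context parameters. Condensed into a standalone sketch (the values are illustrative, not prescriptive):

```ruby
require 'etc'
require 'llama_cpp'

n_threads = Etc.nprocessors # default used by the updated examples

ctx_params = LLaMACpp::ContextParams.new
ctx_params.n_threads = n_threads       # threads used during generation
ctx_params.n_threads_batch = n_threads # threads used for batch/prompt processing
```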
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -51,6 +51,7 @@ public:
 static void define_class(VALUE outer) {
 rb_cLLaMABatch = rb_define_class_under(outer, "Batch", rb_cObject);
 rb_define_alloc_func(rb_cLLaMABatch, llama_batch_alloc);
+rb_define_singleton_method(rb_cLLaMABatch, "get_one", RUBY_METHOD_FUNC(_llama_batch_get_one), -1);
 rb_define_method(rb_cLLaMABatch, "initialize", RUBY_METHOD_FUNC(_llama_batch_initialize), -1);
 rb_define_method(rb_cLLaMABatch, "n_tokens=", RUBY_METHOD_FUNC(_llama_batch_set_n_tokens), 1);
 rb_define_method(rb_cLLaMABatch, "n_tokens", RUBY_METHOD_FUNC(_llama_batch_get_n_tokens), 0);
@@ -75,6 +76,48 @@ public:
 private:
 static const rb_data_type_t llama_batch_type;
 
+static VALUE _llama_batch_get_one(int argc, VALUE* argv, VALUE klass) {
+VALUE kw_args = Qnil;
+ID kw_table[4] = { rb_intern("tokens"), rb_intern("n_tokens"), rb_intern("pos_zero"), rb_intern("seq_id") };
+VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+rb_scan_args(argc, argv, ":", &kw_args);
+rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+rb_raise(rb_eArgError, "tokens must be an array");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+rb_raise(rb_eArgError, "n_tokens must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+rb_raise(rb_eArgError, "pos_zero must be an integer");
+return Qnil;
+}
+if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+rb_raise(rb_eArgError, "seq_id must be an integer");
+return Qnil;
+}
+
+const size_t sz_array = RARRAY_LEN(kw_values[0]);
+const int32_t n_tokens = NUM2INT(kw_values[1]);
+const llama_pos pos_zero = NUM2INT(kw_values[2]);
+const llama_seq_id seq_id = NUM2INT(kw_values[3]);
+
+LLaMABatchWrapper* ptr = (LLaMABatchWrapper*)ruby_xmalloc(sizeof(LLaMABatchWrapper));
+new (ptr) LLaMABatchWrapper();
+ptr->batch = llama_batch_get_one(nullptr, n_tokens, pos_zero, seq_id);
+
+ptr->batch.token = (llama_token*)malloc(sizeof(llama_token) * sz_array);
+for (size_t i = 0; i < sz_array; i++) {
+VALUE el = rb_ary_entry(kw_values[0], i);
+ptr->batch.token[i] = NUM2INT(el);
+}
+
+return TypedData_Wrap_Struct(klass, &llama_batch_type, ptr);
+}
+
 static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
 ID kw_table[3] = { rb_intern("max_n_token"), rb_intern("n_embd"), rb_intern("max_n_seq") };
@@ -1983,6 +2026,7 @@ public:
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_cp", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_cp), 4);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_keep", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_keep), 1);
 rb_define_method(rb_cLLaMAContext, "kv_cache_seq_shift", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_shift), 4);
+rb_define_method(rb_cLLaMAContext, "kv_cache_seq_div", RUBY_METHOD_FUNC(_llama_context_kv_cache_seq_div), 4);
 rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
 rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
 rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
@@ -2054,6 +2098,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2104,6 +2150,8 @@ private:
 rb_scan_args(argc, argv, ":", &kw_args);
 rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
 
+rb_warn("eval_embd is deprecated. Use decode instead.");
+
 if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
 rb_raise(rb_eArgError, "tokens must be an Array");
 return Qnil;
@@ -2331,6 +2379,16 @@ private:
 return Qnil;
 }
 
+static VALUE _llama_context_kv_cache_seq_div(VALUE self, VALUE seq_id, VALUE p0, VALUE p1, VALUE d) {
+LLaMAContextWrapper* ptr = get_llama_context(self);
+if (ptr->ctx == NULL) {
+rb_raise(rb_eArgError, "LLaMA context is not initialized");
+return Qnil;
+}
+llama_kv_cache_seq_div(ptr->ctx, NUM2INT(seq_id), NUM2INT(p0), NUM2INT(p1), NUM2INT(d));
+return Qnil;
+}
+
 static VALUE _llama_context_set_rng_seed(VALUE self, VALUE seed_) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
@@ -2794,6 +2852,8 @@ private:
 rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
 rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
 
+rb_warn("sample_temperature is deprecated. Use sample_temp instead.");
+
 if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
 rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
 return Qnil;
@@ -3160,6 +3220,9 @@ extern "C" void Init_llama_cpp(void) {
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_S));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
+rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
 
 rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
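The new `kv_cache_seq_div` binding above forwards to `llama_kv_cache_seq_div`, which in upstream llama.cpp integer-divides the cached positions of sequence `seq_id` in the range `[p0, p1)` by `d` (the building block of the "self-extend" context-extension trick). A hedged usage sketch, assuming `context` is an initialized `LLaMACpp::Context` and the numbers below are placeholders:

```ruby
# Positional arguments mirror the C binding: (seq_id, p0, p1, d).
seq_id = 0
n_past = 128 # placeholder: tokens already decoded into the KV cache
context.kv_cache_seq_div(seq_id, 0, n_past, 2) # compress cached positions in [0, n_past) by 2
```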
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.11.1'
+VERSION = '0.12.1'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'b1768'
+LLAMA_CPP_VERSION = 'b1833'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -54,7 +54,7 @@ module LLaMACpp
 embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
 end
 
-context.
+context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
 end
 
 n_past += embd.size
@@ -77,7 +77,7 @@ module LLaMACpp
 context.sample_tail_free(candidates, z: tfs_z)
 context.sample_typical(candidates, prob: typical_p)
 context.sample_top_p(candidates, prob: top_p)
-context.
+context.sample_temp(candidates, temp: temperature)
 id = context.sample_token(candidates)
 
 last_n_tokens.shift
data/sig/llama_cpp.rbs
CHANGED
@@ -22,6 +22,9 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_Q5_K_S: Integer
 LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
 LLAMA_FTYPE_MOSTLY_Q6_K: Integer
+LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
+LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
+LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
 
 LLAMA_KV_OVERRIDE_INT: Integer
 LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -149,6 +152,7 @@ module LLaMACpp
 class Batch
 public
 
+def self.get_one: (tokens: Array[Integer], n_tokens: Integer, pos_zero: Integer, seq_id: Integer) -> ::LLaMACpp::Batch
 def initialize: (max_n_token: Integer, n_embd: Integer, max_n_seq: Integer) -> void
 def n_tokens=: (Integer) -> Integer
 def n_tokens: () -> Integer
@@ -192,6 +196,7 @@ module LLaMACpp
 def kv_cache_seq_cp: (Integer, Integer,Integer, Integer) -> void
 def kv_cache_seq_keep: (Integer) -> void
 def kv_cache_seq_shift: (Integer, Integer, Integer, Integer) -> void
+def kv_cache_seq_div: (Integer, Integer, Integer, Integer) -> void
 def set_rng_seed: (Integer) -> void
 def load_session_file: (session_path: String) -> void
 def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
+speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -620,6 +620,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -671,6 +674,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -90,7 +90,7 @@ extern "C" {
 void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
 // compute graph without a plan
-
+bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
 // check if the backend supports an operation
 bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
 ggml_backend_synchronize(backend);
 }
 
-
-backend->iface.graph_compute(backend, cgraph)
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+if (!backend->iface.graph_compute(backend, cgraph)) {
+return false;
+}
 
 // TODO: optional sync
 ggml_backend_synchronize(backend);
+return true;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
 GGML_UNUSED(backend);
 }
 
-static
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
 struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
 struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 cplan.work_data = cpu_ctx->work_data;
 
 ggml_graph_compute(cgraph, &cplan);
+return true;
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
data/vendor/tmp/llama.cpp/ggml-backend.h
CHANGED
@@ -58,7 +58,7 @@ extern "C" {
 
 GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API
+GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
 
 // tensor copy between different backends