llama_cpp 0.2.0 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +697 -130
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +548 -497
- data/ext/llama_cpp/src/ggml-metal.metal +425 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -32
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +1904 -303
- data/ext/llama_cpp/src/ggml.h +126 -2
- data/ext/llama_cpp/src/llama.cpp +212 -108
- data/ext/llama_cpp/src/llama.h +12 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +4 -2
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+  data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+  data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
```
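The SHA256 and SHA512 digests above cover the two members of the released `.gem` archive. If you want to recompute them for a downloaded copy, a sketch like the one below works; it is not part of this diff, and it assumes `llama_cpp-0.2.1.gem` has been fetched into the current directory (e.g. via `gem fetch llama_cpp --version 0.2.1`) and that a `tar` command is available, since a `.gem` file is a plain tar archive containing `metadata.gz` and `data.tar.gz`.

```ruby
# Hypothetical verification sketch; not part of the gem or this diff.
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  # Extract the member from the downloaded gem (a plain tar archive).
  system('tar', '-xf', 'llama_cpp-0.2.1.gem', member, exception: true)
  puts "#{member} SHA256: #{Digest::SHA256.file(member).hexdigest}"
  puts "#{member} SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```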
data/CHANGELOG.md
CHANGED

```diff
@@ -1,3 +1,10 @@
+## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+- Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+- Add `low_vram` parameter to ContextParams.
+- Add `vocab` method to Context.
+- Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
 ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
 
 - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
```
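The `low_vram` and `vocab` entries correspond to the binding changes in `ext/llama_cpp/llama_cpp.cpp` further down in this diff, and the example script mentioned here is the `examples/chat.rb` added below. As a minimal sketch of the new accessor (the model path is a placeholder, and `low_vram` presumably maps straight to llama.cpp's option of the same name, which lowers VRAM usage at some cost in speed):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 1234
params.low_vram = true # new in 0.2.1; forwarded to llama.cpp's low_vram option
puts params.low_vram   # => true

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
puts context.n_ctx
```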
data/examples/README.md
ADDED

````md
# llama_cpp.rb/examples

## chat.rb

### Usage

```sh
$ cd examples
$ gem install llama_cpp thor
$ ./chat.rb -m /path/to/quantized-model.bin -t 4
...
User: Please tell me the largest city in Japan.
Bob: Sure. The largest city in Japan is Tokyo.
User:
```

### Options

```sh
$ ./chat.rb help main
Usage:
  chat.rb main -m, --model=MODEL

Options:
  -s, [--seed=N]                         # random seed
                                         # Default: -1
  -t, [--threads=N]                      # number of threads
                                         # Default: 2
  -m, --model=MODEL                      # path to model file
  -f, [--file=FILE]                      # prompt file to start generation
  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
  -b, [--batch-size=N]                   # batch size for prompt processing
                                         # Default: 1024
  -n, [--n-predict=N]                    # number of tokens to predict
                                         # Default: 256
      [--keep=N]                         # number of tokens to keep from the initial prompt
                                         # Default: 48
      [--repeat-last-n=N]                # last n tokens to consider for penalize
                                         # Default: 64
      [--repeat-penalty=N]               # penalize repeat sequence of tokens
                                         # Default: 1.0
      [--presence-penalty=N]             # repeat alpha presence penalty
                                         # Default: 0.0
      [--frequency-penalty=N]            # repeat alpha frequency penalty
                                         # Default: 0.0
      [--top-k=N]                        # top k sampling
                                         # Default: 40
      [--top-p=N]                        # top p sampling
                                         # Default: 0.95
      [--tfs-z=N]                        # tail free sampling, parameter z
                                         # Default: 1.0
      [--typical-p=N]                    # locally typical sampling, parameter p
                                         # Default: 1.0
      [--temp=N]                         # temperature
                                         # Default: 0.8
      [--n-gpu-layers=N]                 # number of layers on GPU
                                         # Default: 0

Start chat
```
````
data/examples/chat.rb
ADDED

```ruby
#!/usr/bin/env ruby
# frozen_string_literal: true

# chat.rb is a simple chatbot that uses llama_cpp to generate text.
# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh

require 'llama_cpp'
require 'thor'
require 'readline'

class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  default_command :main
  desc 'main', 'Start chat'
  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
  option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
  option :temp, type: :numeric, desc: 'temperature', default: 0.8
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
    params = LLaMACpp::ContextParams.new
    params.seed = options[:seed]
    params.n_gpu_layers = options[:n_gpu_layers]
    context = LLaMACpp::Context.new(model_path: options[:model], params: params)

    antiprompt = options[:reverse_prompt] || 'User:'
    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)

    embd_input = context.tokenize(text: start_prompt, add_bos: true)

    n_ctx = context.n_ctx
    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

    n_keep = options[:keep]
    n_keep = embd_input.size if n_keep > embd_input.size

    token_newline = context.tokenize(text: "\n", add_bos: false)

    last_n_tokens = [0] * n_ctx
    interactive = true
    is_interacting = false
    input_echo = true
    first_input = true
    embd = []
    n_consumed = 0
    n_past = 0
    n_remain = options[:n_predict]
    n_vocab = context.n_vocab

    while interactive
      unless embd.empty?
        if n_past + embd.size > n_ctx
          n_left = n_past - n_keep
          n_past = [1, n_keep].max
          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
        end

        0.step(embd.size - 1, options[:batch_size]) do |i|
          n_eval = [options[:batch_size], embd.size - i].min
          context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
          n_past += n_eval
        end
      end

      embd.clear

      if embd_input.size <= n_consumed && !is_interacting
        logits = context.logits
        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
        candidates = LLaMACpp::TokenDataArray.new(base_candidates)

        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
        context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
        context.sample_frequency_and_presence_penalties(
          candidates, last_n_tokens[-last_n_repeat..],
          frequency: options[:frequency_penalty], presence: options[:presence_penalty]
        )

        context.sample_top_k(candidates, k: options[:top_k])
        context.sample_tail_free(candidates, z: options[:tfs_z])
        context.sample_typical(candidates, prob: options[:typical_p])
        context.sample_top_p(candidates, prob: options[:top_p])
        context.sample_temperature(candidates, temperature: options[:temp])
        id = context.sample_token(candidates)

        last_n_tokens.shift
        last_n_tokens.push(id)

        if id == LLaMACpp.token_eos
          id = token_newline.first
          unless antiprompt.empty?
            first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
            embd_input.concat(first_antiprompt)
          end
        end

        embd.push(id)
        input_echo = true
        n_remain -= 1
      else
        while embd_input.size > n_consumed
          embd.push(embd_input[n_consumed])
          last_n_tokens.shift
          last_n_tokens.push(embd_input[n_consumed])
          n_consumed += 1
          break if embd.size >= options[:batch_size]
        end
      end

      if input_echo
        output = []
        embd.each { |token| output << context.token_to_str(token) }
        output_str = output.join
        output_str.chomp!(antiprompt) if first_input
        print(output_str)
      end

      if embd_input.size <= n_consumed
        if antiprompt.size.positive?
          last_output = []
          last_n_tokens.each { |token| last_output << context.token_to_str(token) }
          last_output_str = last_output.join

          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
          unless last_output_str.index(antiprompt, search_start_pos).nil?
            is_interacting = true
            true
          end
        end

        if n_past.positive? && is_interacting
          if first_input
            print("\r#{antiprompt}")
            first_input = false
          end
          buffer = Readline.readline(' ')
          break interactive = false if buffer.nil?

          if buffer.size > 1
            line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
            embd_input.concat(line_input)
            n_remain -= line_input.size
          end

          input_echo = false
        end

        is_interacting = false if n_past.positive?
      end

      if n_remain <= 0 && options[:n_predict] != -1
        n_remain = options[:n_predict]
        is_interacting = true
      end
    end
  end

  private

  def read_prompt(filename)
    return if filename.nil?

    File.read(filename).chomp
  end

  def default_prompt(antiprompt)
    # Reference:
    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
    prompt = <<~MSG
      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

      User: Hello, Bob.
      Bob: Hello. How may I help you today?
      User: Please tell me the largest city in Europe.
      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
    MSG
    prompt + antiprompt
  end
end

Chat.start(ARGV)
```
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

```diff
@@ -300,6 +300,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
     rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
     rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
+    rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
     rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
     rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
     rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -386,6 +388,18 @@ private:
     return ret;
   };
 
+  // low_vram
+  static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.low_vram = low_vram == Qtrue ? true : false;
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
+  static VALUE _llama_context_params_get_low_vram(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.low_vram ? Qtrue : Qfalse;
+  };
+
   // seed
   static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
@@ -641,6 +655,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
     rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
     rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+    rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
     rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -896,6 +911,43 @@ private:
     return output;
   };
 
+  static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[1] = { rb_intern("capacity") };
+    VALUE kw_values[1] = { Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+    if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "capacity must be an integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    const int capacity = NUM2INT(kw_values[0]);
+    std::vector<const char*> strings;
+    std::vector<float> scores;
+    int n_vocab = llama_n_vocab(ptr->ctx);
+    strings.resize(n_vocab, NULL);
+    scores.resize(n_vocab, 0);
+
+    n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+
+    VALUE ret_strings = rb_ary_new();
+    VALUE ret_scores = rb_ary_new();
+    for (int i = 0; i < n_vocab; i++) {
+      rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
+      rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+    }
+
+    return rb_ary_new_from_args(2, ret_strings, ret_scores);
+  };
+
   static VALUE _llama_context_n_vocab(VALUE self) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
```