llama_cpp 0.2.0 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
- data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
+ metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+ data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
  SHA512:
- metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
- data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
+ metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+ data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+ - Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+ - Add `low_vram` parameter to ContextParams.
+ - Add `vocab` method to Context.
+ - Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
  ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11

  - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
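The headline additions in 0.2.1 are the `low_vram` accessor on `LLaMACpp::ContextParams` and the `vocab` method on `LLaMACpp::Context`, both wired up in the C++ binding diff further below. As a minimal sketch of the new parameter (the model path is a placeholder, not a file shipped with the gem), `low_vram` is set on the params object before the context is created and is forwarded to llama.cpp's low_vram option, which trades some speed for lower VRAM usage:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42
params.n_gpu_layers = 32 # number of layers to offload to the GPU
params.low_vram = true   # new in 0.2.1: forwards llama.cpp's low_vram flag

# '/path/to/quantized-model.bin' is a placeholder; point it at any quantized GGML model file.
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
puts context.n_ctx
```

A usage sketch for `vocab` follows its binding at the end of this diff.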
data/examples/README.md ADDED
@@ -0,0 +1,60 @@
+ # llama_cpp.rb/examples
+
+ ## chat.rb
+
+ ### Usage
+
+ ```sh
+ $ cd examples
+ $ gem install llama_cpp thor
+ $ ./chat.rb -m /path/to/quantized-model.bin -t 4
+ ...
+ User: Please tell me the largest city in Japan.
+ Bob: Sure. The largest city in Japan is Tokyo.
+ User:
+ ```
+
+ ### Options
+
+ ```sh
+ $ ./chat.rb help main
+ Usage:
+   chat.rb main -m, --model=MODEL
+
+ Options:
+   -s, [--seed=N]                        # random seed
+                                         # Default: -1
+   -t, [--threads=N]                     # number of threads
+                                         # Default: 2
+   -m, --model=MODEL                     # path to model file
+   -f, [--file=FILE]                     # prompt file to start generation
+   -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
+   -b, [--batch-size=N]                  # batch size for prompt processing
+                                         # Default: 1024
+   -n, [--n-predict=N]                   # number of tokens to predict
+                                         # Default: 256
+   [--keep=N]                            # number of tokens to keep from the initial prompt
+                                         # Default: 48
+   [--repeat-last-n=N]                   # last n tokens to consider for penalize
+                                         # Default: 64
+   [--repeat-penalty=N]                  # penalize repeat sequence of tokens
+                                         # Default: 1.0
+   [--presence-penalty=N]                # repeat alpha presence penalty
+                                         # Default: 0.0
+   [--frequency-penalty=N]               # repeat alpha frequency penalty
+                                         # Default: 0.0
+   [--top-k=N]                           # top k sampling
+                                         # Default: 40
+   [--top-p=N]                           # top p sampling
+                                         # Default: 0.95
+   [--tfs-z=N]                           # tail free sampling, parameter z
+                                         # Default: 1.0
+   [--typical-p=N]                       # locally typical sampling, parameter p
+                                         # Default: 1.0
+   [--temp=N]                            # temperature
+                                         # Default: 0.8
+   [--n-gpu-layers=N]                    # number of layers on GPU
+                                         # Default: 0
+
+ Start chat
+ ```
data/examples/chat.rb ADDED
@@ -0,0 +1,195 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # chat.rb is a simple chatbot that uses llama_cpp to generate text.
+ # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
+
+ require 'llama_cpp'
+ require 'thor'
+ require 'readline'
+
+ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
+   default_command :main
+   desc 'main', 'Start chat'
+   option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
+   option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
+   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+   option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
+   option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
+   option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
+   option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
+   option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
+   option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
+   option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
+   option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
+   option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
+   option :top_k, type: :numeric, desc: 'top k sampling', default: 40
+   option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
+   option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
+   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
+   option :temp, type: :numeric, desc: 'temperature', default: 0.8
+   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+     params = LLaMACpp::ContextParams.new
+     params.seed = options[:seed]
+     params.n_gpu_layers = options[:n_gpu_layers]
+     context = LLaMACpp::Context.new(model_path: options[:model], params: params)
+
+     antiprompt = options[:reverse_prompt] || 'User:'
+     start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
+
+     embd_input = context.tokenize(text: start_prompt, add_bos: true)
+
+     n_ctx = context.n_ctx
+     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+     n_keep = options[:keep]
+     n_keep = embd_input.size if n_keep > embd_input.size
+
+     token_newline = context.tokenize(text: "\n", add_bos: false)
+
+     last_n_tokens = [0] * n_ctx
+     interactive = true
+     is_interacting = false
+     input_echo = true
+     first_input = true
+     embd = []
+     n_consumed = 0
+     n_past = 0
+     n_remain = options[:n_predict]
+     n_vocab = context.n_vocab
+
+     while interactive
+       unless embd.empty?
+         if n_past + embd.size > n_ctx
+           n_left = n_past - n_keep
+           n_past = [1, n_keep].max
+           embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+         end
+
+         0.step(embd.size - 1, options[:batch_size]) do |i|
+           n_eval = [options[:batch_size], embd.size - i].min
+           context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+           n_past += n_eval
+         end
+       end
+
+       embd.clear
+
+       if embd_input.size <= n_consumed && !is_interacting
+         logits = context.logits
+         base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+         candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+         last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
+         context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
+         context.sample_frequency_and_presence_penalties(
+           candidates, last_n_tokens[-last_n_repeat..],
+           frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+         )
+
+         context.sample_top_k(candidates, k: options[:top_k])
+         context.sample_tail_free(candidates, z: options[:tfs_z])
+         context.sample_typical(candidates, prob: options[:typical_p])
+         context.sample_top_p(candidates, prob: options[:top_p])
+         context.sample_temperature(candidates, temperature: options[:temp])
+         id = context.sample_token(candidates)
+
+         last_n_tokens.shift
+         last_n_tokens.push(id)
+
+         if id == LLaMACpp.token_eos
+           id = token_newline.first
+           unless antiprompt.empty?
+             first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+             embd_input.concat(first_antiprompt)
+           end
+         end
+
+         embd.push(id)
+         input_echo = true
+         n_remain -= 1
+       else
+         while embd_input.size > n_consumed
+           embd.push(embd_input[n_consumed])
+           last_n_tokens.shift
+           last_n_tokens.push(embd_input[n_consumed])
+           n_consumed += 1
+           break if embd.size >= options[:batch_size]
+         end
+       end
+
+       if input_echo
+         output = []
+         embd.each { |token| output << context.token_to_str(token) }
+         output_str = output.join
+         output_str.chomp!(antiprompt) if first_input
+         print(output_str)
+       end
+
+       if embd_input.size <= n_consumed
+         if antiprompt.size.positive?
+           last_output = []
+           last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+           last_output_str = last_output.join
+
+           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
+           unless last_output_str.index(antiprompt, search_start_pos).nil?
+             is_interacting = true
+             true
+           end
+         end
+
+         if n_past.positive? && is_interacting
+           if first_input
+             print("\r#{antiprompt}")
+             first_input = false
+           end
+           buffer = Readline.readline(' ')
+           break interactive = false if buffer.nil?
+
+           if buffer.size > 1
+             line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+             embd_input.concat(line_input)
+             n_remain -= line_input.size
+           end
+
+           input_echo = false
+         end
+
+         is_interacting = false if n_past.positive?
+       end
+
+       if n_remain <= 0 && options[:n_predict] != -1
+         n_remain = options[:n_predict]
+         is_interacting = true
+       end
+     end
+   end
+
+   private
+
+   def read_prompt(filename)
+     return if filename.nil?
+
+     File.read(filename).chomp
+   end
+
+   def default_prompt(antiprompt)
+     # Reference:
+     # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
+     prompt = <<~MSG
+       Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+       User: Hello, Bob.
+       Bob: Hello. How may I help you today?
+       User: Please tell me the largest city in Europe.
+       Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+     MSG
+     prompt + antiprompt
+   end
+ end
+
+ Chat.start(ARGV)
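chat.rb carries a fair amount of interactive bookkeeping (antiprompt detection, context-window swapping, Readline input). Reduced to a single non-interactive prompt, the `Context` API it relies on boils down to roughly the sketch below; the model path and prompt are placeholders and the sampling settings are hard-coded for brevity. Every call here mirrors one that appears verbatim in chat.rb above; only the repetition, frequency, and presence penalties and the interactive loop are omitted.

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 12
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# Evaluate the prompt once, then repeatedly sample a token and feed it back in.
tokens = context.tokenize(text: 'User: Please tell me the largest city in Japan.', add_bos: true)
context.eval(tokens: tokens, n_past: 0, n_threads: 4)
n_past = tokens.size
n_vocab = context.n_vocab

64.times do
  logits = context.logits
  candidates = LLaMACpp::TokenDataArray.new(
    Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
  )
  context.sample_top_k(candidates, k: 40)
  context.sample_top_p(candidates, prob: 0.95)
  context.sample_temperature(candidates, temperature: 0.8)
  id = context.sample_token(candidates)
  break if id == LLaMACpp.token_eos

  print context.token_to_str(id)
  context.eval(tokens: [id], n_past: n_past, n_threads: 4)
  n_past += 1
end
```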
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -300,6 +300,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
  rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
  rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+ rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
+ rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -386,6 +388,18 @@ private:
  return ret;
  };

+ // low_vram
+ static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   ptr->params.low_vram = low_vram == Qtrue ? true : false;
+   return ptr->params.low_vram ? Qtrue : Qfalse;
+ };
+
+ static VALUE _llama_context_params_get_low_vram(VALUE self) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   return ptr->params.low_vram ? Qtrue : Qfalse;
+ };
+
  // seed
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
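One detail worth noting in the setter above: the incoming value is compared against `Qtrue`, so only the Ruby literal `true` enables the flag, and any other value (even a truthy one) stores `false`. A quick sketch of the resulting Ruby-level behaviour:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new

params.low_vram = true
params.low_vram #=> true

# Only the literal `true` passes the Qtrue check; other truthy values store false.
params.low_vram = 1
params.low_vram #=> false
```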
@@ -641,6 +655,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+ rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -896,6 +911,43 @@ private:
  return output;
  };

+ static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[1] = { rb_intern("capacity") };
+   VALUE kw_values[1] = { Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+   if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+     rb_raise(rb_eArgError, "capacity must be an integer");
+     return Qnil;
+   }
+
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+
+   const int capacity = NUM2INT(kw_values[0]);
+   std::vector<const char*> strings;
+   std::vector<float> scores;
+   int n_vocab = llama_n_vocab(ptr->ctx);
+   strings.resize(n_vocab, NULL);
+   scores.resize(n_vocab, 0);
+
+   n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+
+   VALUE ret_strings = rb_ary_new();
+   VALUE ret_scores = rb_ary_new();
+   for (int i = 0; i < n_vocab; i++) {
+     rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
+     rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+   }
+
+   return rb_ary_new_from_args(2, ret_strings, ret_scores);
+ };
+
  static VALUE _llama_context_n_vocab(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
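At the Ruby level, the binding above surfaces as `Context#vocab`, which takes a required `capacity:` keyword (a non-integer raises `ArgumentError`) and returns a pair of arrays: the token strings and their scores. A short sketch, assuming `context` was built as in the earlier examples:

```ruby
strings, scores = context.vocab(capacity: context.n_vocab)
puts strings.size             # at most `capacity` entries
puts strings.first(5).inspect # the first few token strings
puts scores.first(5).inspect  # and their scores

begin
  context.vocab(capacity: 'all')
rescue ArgumentError => e
  puts e.message # "capacity must be an integer"
end
```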