llama_cpp 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: bf8532fb7d2d96acd42b0da600cd5a8923411545817cb433036c182cb6d549ca
- data.tar.gz: 2aa68d4ffe814632d6b8f2d97f6407284520d2b540b8920c40d486c599221bc3
+ metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+ data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
  SHA512:
- metadata.gz: ea8dad06ae15f9ca6ba585ae901d163bc6580543131338bbe785444083791b6251b2f631725190f9935740d0169520e3da604a66330e5bf7551031b7dc47dd81
- data.tar.gz: 3b32180e6a4653af2afac59d4640c5d06b29f5872d7ef40f33dcddc27b97e2eb0fa2f38fe6389ddcb60b4631db9e3ebfd0d4b14cc9de6419e50452b4c67ad98a
+ metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+ data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+ - Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+ - Add `low_vram` parameter to ContextParams.
+ - Add `vocab` method to Context.
+ - Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
  ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11

  - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
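After upgrading, the installed release and bundled llama.cpp build can be checked from the version constants the gem defines in `lib/llama_cpp/version.rb`. The constant names below are assumed from other releases of the gem and do not appear in this diff; treat this as a sketch, not part of the package contents.

```ruby
# Sketch: confirming the upgrade (constant names are an assumption, not shown in this diff).
require 'llama_cpp'

puts LLaMACpp::VERSION           # expected to print 0.2.1
puts LLaMACpp::LLAMA_CPP_VERSION # expected to print master-a09f919
```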
@@ -0,0 +1,60 @@
+ # llama_cpp.rb/examples
+
+ ## chat.rb
+
+ ### Usage
+
+ ```sh
+ $ cd examples
+ $ gem install llama_cpp thor
+ $ ./chat.rb -m /path/to/quantized-model.bin -t 4
+ ...
+ User: Please tell me the largest city in Japan.
+ Bob: Sure. The largest city in Japan is Tokyo.
+ User:
+ ```
+
+ ### Options
+
+ ```sh
+ $ ./chat.rb help main
+ Usage:
+ chat.rb main -m, --model=MODEL
+
+ Options:
+ -s, [--seed=N] # random seed
+ # Default: -1
+ -t, [--threads=N] # number of threads
+ # Default: 2
+ -m, --model=MODEL # path to model file
+ -f, [--file=FILE] # prompt file to start generation
+ -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
+ -b, [--batch-size=N] # batch size for prompt processing
+ # Default: 1024
+ -n, [--n-predict=N] # number of tokens to predict
+ # Default: 256
+ [--keep=N] # number of tokens to keep from the initial prompt
+ # Default: 48
+ [--repeat-last-n=N] # last n tokens to consider for penalize
+ # Default: 64
+ [--repeat-penalty=N] # penalize repeat sequence of tokens
+ # Default: 1.0
+ [--presence-penalty=N] # repeat alpha presence penalty
+ # Default: 0.0
+ [--frequency-penalty=N] # repeat alpha frequency penalty
+ # Default: 0.0
+ [--top-k=N] # top k sampling
+ # Default: 40
+ [--top-p=N] # top p sampling
+ # Default: 0.95
+ [--tfs-z=N] # tail free sampling, parameter z
+ # Default: 1.0
+ [--typical-p=N] # locally typical sampling, parameter p
+ # Default: 1.0
+ [--temp=N] # temperature
+ # Default: 0.8
+ [--n-gpu-layers=N] # number of layers on GPU
+ # Default: 0
+
+ Start chat
+ ```
data/examples/chat.rb ADDED
@@ -0,0 +1,195 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # chat.rb is a simple chatbot that uses llama_cpp to generate text.
+ # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
+
+ require 'llama_cpp'
+ require 'thor'
+ require 'readline'
+
+ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
+   default_command :main
+   desc 'main', 'Start chat'
+   option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
+   option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
+   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+   option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
+   option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
+   option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
+   option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
+   option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
+   option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
+   option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
+   option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
+   option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
+   option :top_k, type: :numeric, desc: 'top k sampling', default: 40
+   option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
+   option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
+   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
+   option :temp, type: :numeric, desc: 'temperature', default: 0.8
+   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+     params = LLaMACpp::ContextParams.new
+     params.seed = options[:seed]
+     params.n_gpu_layers = options[:n_gpu_layers]
+     context = LLaMACpp::Context.new(model_path: options[:model], params: params)
+
+     antiprompt = options[:reverse_prompt] || 'User:'
+     start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
+
+     embd_input = context.tokenize(text: start_prompt, add_bos: true)
+
+     n_ctx = context.n_ctx
+     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+     n_keep = options[:keep]
+     n_keep = embd_input.size if n_keep > embd_input.size
+
+     token_newline = context.tokenize(text: "\n", add_bos: false)
+
+     last_n_tokens = [0] * n_ctx
+     interactive = true
+     is_interacting = false
+     input_echo = true
+     first_input = true
+     embd = []
+     n_consumed = 0
+     n_past = 0
+     n_remain = options[:n_predict]
+     n_vocab = context.n_vocab
+
+     while interactive
+       unless embd.empty?
+         if n_past + embd.size > n_ctx
+           n_left = n_past - n_keep
+           n_past = [1, n_keep].max
+           embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+         end
+
+         0.step(embd.size - 1, options[:batch_size]) do |i|
+           n_eval = [options[:batch_size], embd.size - i].min
+           context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+           n_past += n_eval
+         end
+       end
+
+       embd.clear
+
+       if embd_input.size <= n_consumed && !is_interacting
+         logits = context.logits
+         base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+         candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+         last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
+         context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
+         context.sample_frequency_and_presence_penalties(
+           candidates, last_n_tokens[-last_n_repeat..],
+           frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+         )
+
+         context.sample_top_k(candidates, k: options[:top_k])
+         context.sample_tail_free(candidates, z: options[:tfs_z])
+         context.sample_typical(candidates, prob: options[:typical_p])
+         context.sample_top_p(candidates, prob: options[:top_p])
+         context.sample_temperature(candidates, temperature: options[:temp])
+         id = context.sample_token(candidates)
+
+         last_n_tokens.shift
+         last_n_tokens.push(id)
+
+         if id == LLaMACpp.token_eos
+           id = token_newline.first
+           unless antiprompt.empty?
+             first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+             embd_input.concat(first_antiprompt)
+           end
+         end
+
+         embd.push(id)
+         input_echo = true
+         n_remain -= 1
+       else
+         while embd_input.size > n_consumed
+           embd.push(embd_input[n_consumed])
+           last_n_tokens.shift
+           last_n_tokens.push(embd_input[n_consumed])
+           n_consumed += 1
+           break if embd.size >= options[:batch_size]
+         end
+       end
+
+       if input_echo
+         output = []
+         embd.each { |token| output << context.token_to_str(token) }
+         output_str = output.join
+         output_str.chomp!(antiprompt) if first_input
+         print(output_str)
+       end
+
+       if embd_input.size <= n_consumed
+         if antiprompt.size.positive?
+           last_output = []
+           last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+           last_output_str = last_output.join
+
+           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
+           unless last_output_str.index(antiprompt, search_start_pos).nil?
+             is_interacting = true
+             true
+           end
+         end
+
+         if n_past.positive? && is_interacting
+           if first_input
+             print("\r#{antiprompt}")
+             first_input = false
+           end
+           buffer = Readline.readline(' ')
+           break interactive = false if buffer.nil?
+
+           if buffer.size > 1
+             line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+             embd_input.concat(line_input)
+             n_remain -= line_input.size
+           end
+
+           input_echo = false
+         end
+
+         is_interacting = false if n_past.positive?
+       end
+
+       if n_remain <= 0 && options[:n_predict] != -1
+         n_remain = options[:n_predict]
+         is_interacting = true
+       end
+     end
+   end
+
+   private
+
+   def read_prompt(filename)
+     return if filename.nil?
+
+     File.read(filename).chomp
+   end
+
+   def default_prompt(antiprompt)
+     # Reference:
+     # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
+     prompt = <<~MSG
+       Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+       User: Hello, Bob.
+       Bob: Hello. How may I help you today?
+       User: Please tell me the largest city in Europe.
+       Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+     MSG
+     prompt + antiprompt
+   end
+ end
+
+ Chat.start(ARGV)
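Stripped of the Thor option handling and the interactive control flow, chat.rb boils down to: tokenize a prompt, `eval` it to fill the context, then repeatedly wrap the logits in a `TokenDataArray`, run the sampling chain, and feed the chosen token back in. The sketch below condenses that loop into a minimal non-interactive completion; it uses only calls that appear in chat.rb above, and the model path, prompt, and thread count are placeholders.

```ruby
# Minimal sketch of the generation loop chat.rb implements (non-interactive).
# Model path and prompt are placeholders; repetition penalties are omitted for brevity.
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

tokens = context.tokenize(text: 'The largest city in Japan is', add_bos: true)
context.eval(tokens: tokens, n_past: 0, n_threads: 4)
n_past = tokens.size

16.times do
  # Wrap the current logits so the sampler helpers can filter and reorder them.
  logits = context.logits
  candidates = LLaMACpp::TokenDataArray.new(
    Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
  )
  context.sample_top_k(candidates, k: 40)
  context.sample_top_p(candidates, prob: 0.95)
  context.sample_temperature(candidates, temperature: 0.8)
  id = context.sample_token(candidates)
  break if id == LLaMACpp.token_eos

  print(context.token_to_str(id))
  context.eval(tokens: [id], n_past: n_past, n_threads: 4)
  n_past += 1
end
```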
@@ -300,6 +300,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "main_gpu=", RUBY_METHOD_FUNC(_llama_context_params_set_main_gpu), 1);
  rb_define_method(rb_cLLaMAContextParams, "main_gpu", RUBY_METHOD_FUNC(_llama_context_params_get_main_gpu), 0);
  rb_define_method(rb_cLLaMAContextParams, "tensor_split", RUBY_METHOD_FUNC(_llama_context_params_get_tensor_split), 0);
+ rb_define_method(rb_cLLaMAContextParams, "low_vram=", RUBY_METHOD_FUNC(_llama_context_params_set_low_vram), 1);
+ rb_define_method(rb_cLLaMAContextParams, "low_vram", RUBY_METHOD_FUNC(_llama_context_params_get_low_vram), 0);
  rb_define_method(rb_cLLaMAContextParams, "seed=", RUBY_METHOD_FUNC(_llama_context_params_set_seed), 1);
  rb_define_method(rb_cLLaMAContextParams, "seed", RUBY_METHOD_FUNC(_llama_context_params_get_seed), 0);
  rb_define_method(rb_cLLaMAContextParams, "f16_kv=", RUBY_METHOD_FUNC(_llama_context_params_set_f16_kv), 1);
@@ -386,6 +388,18 @@ private:
  return ret;
  };

+ // low_vram
+ static VALUE _llama_context_params_set_low_vram(VALUE self, VALUE low_vram) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   ptr->params.low_vram = low_vram == Qtrue ? true : false;
+   return ptr->params.low_vram ? Qtrue : Qfalse;
+ };
+
+ static VALUE _llama_context_params_get_low_vram(VALUE self) {
+   LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+   return ptr->params.low_vram ? Qtrue : Qfalse;
+ };
+
  // seed
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
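In Ruby these bindings surface as `ContextParams#low_vram` and `#low_vram=`; the setter stores `true` only when passed exactly `true`, and anything else becomes `false`. A minimal usage sketch, with a placeholder model path:

```ruby
# Sketch: enabling the new low_vram flag before creating a context.
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.low_vram = true   # values other than true are stored as false
puts params.low_vram     # => true
context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
```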
@@ -641,6 +655,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
+ rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -896,6 +911,43 @@ private:
  return output;
  };

+ static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[1] = { rb_intern("capacity") };
+   VALUE kw_values[1] = { Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
+
+   if (!RB_INTEGER_TYPE_P(kw_values[0])) {
+     rb_raise(rb_eArgError, "capacity must be an integer");
+     return Qnil;
+   }
+
+   LLaMAContextWrapper* ptr = get_llama_context(self);
+   if (ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+
+   const int capacity = NUM2INT(kw_values[0]);
+   std::vector<const char*> strings;
+   std::vector<float> scores;
+   int n_vocab = llama_n_vocab(ptr->ctx);
+   strings.resize(n_vocab, NULL);
+   scores.resize(n_vocab, 0);
+
+   n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+
+   VALUE ret_strings = rb_ary_new();
+   VALUE ret_scores = rb_ary_new();
+   for (int i = 0; i < n_vocab; i++) {
+     rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
+     rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+   }
+
+   return rb_ary_new_from_args(2, ret_strings, ret_scores);
+ };
+
  static VALUE _llama_context_n_vocab(VALUE self) {
    LLaMAContextWrapper* ptr = get_llama_context(self);
    if (ptr->ctx == NULL) {
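From Ruby, the new `Context#vocab` takes a required `capacity:` Integer keyword and returns a two-element array of token strings and their scores, with at most `capacity` entries, so passing `context.n_vocab` retrieves the full vocabulary. A small usage sketch, with a placeholder model path:

```ruby
# Sketch: dumping the first few vocabulary entries via the new Context#vocab.
require 'llama_cpp'

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin',
                                params: LLaMACpp::ContextParams.new)
strings, scores = context.vocab(capacity: context.n_vocab)
puts "vocabulary size: #{strings.size}"
strings.first(5).zip(scores.first(5)).each { |token, score| puts "#{token.inspect}\t#{score}" }
```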