llama_cpp 0.1.4 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
- data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
+ metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+ data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
  SHA512:
- metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
- data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
+ metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+ data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,39 @@
+ ## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+ - Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+ - Add `low_vram` parameter to ContextParams.
+ - Add `vocab` method to Context.
+ - Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
+ ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
+
+ - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
+ - Fix installation files for CUDA.
+ - Add metal config option:
+ ```
+ $ gem install llama_cpp -- --with-metal
+ ```
+ ```ruby
+ require 'llama_cpp'
+
+ params = LLaMACpp::ContextParams.new
+ params.n_gpu_layers = 1
+
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
+ LLaMACpp.generate(context, 'Hello, world.')
+ ```
+
+ **Breaking Changes**
+
+ - Add ModelQuantizeParams class.
+ - Change the argument of the `model_quantize` module function in LLaMACpp.
+ ```ruby
+ require 'llama_cpp'
+
+ params = LLaMACpp::ModelQuantizeParams.new
+ LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
+ ```
+
  ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
 
  - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
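For the 0.2.1 entries above (the new `low_vram` parameter and the `vocab` method), a minimal usage sketch; the `low_vram=` writer and the argument-free `vocab` call are assumptions read off the bullet points, not confirmed signatures:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.low_vram = true # assumed writer for the new low_vram parameter
params.n_gpu_layers = 1

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)

# `vocab` is new in 0.2.1; the changelog does not show its signature,
# so an argument-free call is assumed here.
pp context.vocab
```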
@@ -0,0 +1,60 @@
+ # llama_cpp.rb/examples
+
+ ## chat.rb
+
+ ### Usage
+
+ ```sh
+ $ cd examples
+ $ gem install llama_cpp thor
+ $ ./chat.rb -m /path/to/quantized-model.bin -t 4
+ ...
+ User: Please tell me the largest city in Japan.
+ Bob: Sure. The largest city in Japan is Tokyo.
+ User:
+ ```
+
+ ### Options
+
+ ```sh
+ $ ./chat.rb help main
+ Usage:
+   chat.rb main -m, --model=MODEL
+
+ Options:
+   -s, [--seed=N]                         # random seed
+                                          # Default: -1
+   -t, [--threads=N]                      # number of threads
+                                          # Default: 2
+   -m, --model=MODEL                      # path to model file
+   -f, [--file=FILE]                      # prompt file to start generation
+   -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
+   -b, [--batch-size=N]                   # batch size for prompt processing
+                                          # Default: 1024
+   -n, [--n-predict=N]                    # number of tokens to predict
+                                          # Default: 256
+       [--keep=N]                         # number of tokens to keep from the initial prompt
+                                          # Default: 48
+       [--repeat-last-n=N]                # last n tokens to consider for penalize
+                                          # Default: 64
+       [--repeat-penalty=N]               # penalize repeat sequence of tokens
+                                          # Default: 1.0
+       [--presence-penalty=N]             # repeat alpha presence penalty
+                                          # Default: 0.0
+       [--frequency-penalty=N]            # repeat alpha frequency penalty
+                                          # Default: 0.0
+       [--top-k=N]                        # top k sampling
+                                          # Default: 40
+       [--top-p=N]                        # top p sampling
+                                          # Default: 0.95
+       [--tfs-z=N]                        # tail free sampling, parameter z
+                                          # Default: 1.0
+       [--typical-p=N]                    # locally typical sampling, parameter p
+                                          # Default: 1.0
+       [--temp=N]                         # temperature
+                                          # Default: 0.8
+       [--n-gpu-layers=N]                 # number of layers on GPU
+                                          # Default: 0
+
+ Start chat
+ ```
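A hypothetical invocation combining several of the options listed above (the flag names and defaults come from the help output; the specific values are illustrative only):

```sh
$ ./chat.rb -m /path/to/quantized-model.bin -t 4 \
    --top-k 50 --top-p 0.9 --temp 0.7 --repeat-penalty 1.1 --n-gpu-layers 1
```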
data/examples/chat.rb ADDED
@@ -0,0 +1,195 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # chat.rb is a simple chatbot that uses llama_cpp to generate text.
+ # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
+
+ require 'llama_cpp'
+ require 'thor'
+ require 'readline'
+
+ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
+   default_command :main
+   desc 'main', 'Start chat'
+   option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
+   option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
+   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+   option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
+   option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
+   option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
+   option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
+   option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
+   option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
+   option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
+   option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
+   option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
+   option :top_k, type: :numeric, desc: 'top k sampling', default: 40
+   option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
+   option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
+   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
+   option :temp, type: :numeric, desc: 'temperature', default: 0.8
+   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+     params = LLaMACpp::ContextParams.new
+     params.seed = options[:seed]
+     params.n_gpu_layers = options[:n_gpu_layers]
+     context = LLaMACpp::Context.new(model_path: options[:model], params: params)
+
+     antiprompt = options[:reverse_prompt] || 'User:'
+     start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
+
+     embd_input = context.tokenize(text: start_prompt, add_bos: true)
+
+     n_ctx = context.n_ctx
+     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+     n_keep = options[:keep]
+     n_keep = embd_input.size if n_keep > embd_input.size
+
+     token_newline = context.tokenize(text: "\n", add_bos: false)
+
+     last_n_tokens = [0] * n_ctx
+     interactive = true
+     is_interacting = false
+     input_echo = true
+     first_input = true
+     embd = []
+     n_consumed = 0
+     n_past = 0
+     n_remain = options[:n_predict]
+     n_vocab = context.n_vocab
+
+     while interactive
+       unless embd.empty?
+         # The context window is full: keep the first n_keep tokens and re-feed half of the remaining history.
+         if n_past + embd.size > n_ctx
+           n_left = n_past - n_keep
+           n_past = [1, n_keep].max
+           embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+         end
+
+         0.step(embd.size - 1, options[:batch_size]) do |i|
+           n_eval = [options[:batch_size], embd.size - i].min
+           context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+           n_past += n_eval
+         end
+       end
+
+       embd.clear
+
+       if embd_input.size <= n_consumed && !is_interacting
+         # Predict the next token: build candidates from the logits, then apply penalties and sampling filters.
+         logits = context.logits
+         base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+         candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+         last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
+         context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
+         context.sample_frequency_and_presence_penalties(
+           candidates, last_n_tokens[-last_n_repeat..],
+           frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+         )
+
+         context.sample_top_k(candidates, k: options[:top_k])
+         context.sample_tail_free(candidates, z: options[:tfs_z])
+         context.sample_typical(candidates, prob: options[:typical_p])
+         context.sample_top_p(candidates, prob: options[:top_p])
+         context.sample_temperature(candidates, temperature: options[:temp])
+         id = context.sample_token(candidates)
+
+         last_n_tokens.shift
+         last_n_tokens.push(id)
+
+         if id == LLaMACpp.token_eos
+           id = token_newline.first
+           unless antiprompt.empty?
+             first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+             embd_input.concat(first_antiprompt)
+           end
+         end
+
+         embd.push(id)
+         input_echo = true
+         n_remain -= 1
+       else
+         while embd_input.size > n_consumed
+           embd.push(embd_input[n_consumed])
+           last_n_tokens.shift
+           last_n_tokens.push(embd_input[n_consumed])
+           n_consumed += 1
+           break if embd.size >= options[:batch_size]
+         end
+       end
+
+       if input_echo
+         output = []
+         embd.each { |token| output << context.token_to_str(token) }
+         output_str = output.join
+         output_str.chomp!(antiprompt) if first_input
+         print(output_str)
+       end
+
+       # Once the prompt has been consumed, watch for the antiprompt and hand control back to the user.
+       if embd_input.size <= n_consumed
+         if antiprompt.size.positive?
+           last_output = []
+           last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+           last_output_str = last_output.join
+
+           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
+           unless last_output_str.index(antiprompt, search_start_pos).nil?
+             is_interacting = true
+             true
+           end
+         end
+
+         if n_past.positive? && is_interacting
+           if first_input
+             print("\r#{antiprompt}")
+             first_input = false
+           end
+           buffer = Readline.readline(' ')
+           break interactive = false if buffer.nil?
+
+           if buffer.size > 1
+             line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+             embd_input.concat(line_input)
+             n_remain -= line_input.size
+           end
+
+           input_echo = false
+         end
+
+         is_interacting = false if n_past.positive?
+       end
+
+       if n_remain <= 0 && options[:n_predict] != -1
+         n_remain = options[:n_predict]
+         is_interacting = true
+       end
+     end
+   end
+
+   private
+
+   def read_prompt(filename)
+     return if filename.nil?
+
+     File.read(filename).chomp
+   end
+
+   def default_prompt(antiprompt)
+     # Reference:
+     # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
+     prompt = <<~MSG
+       Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+       User: Hello, Bob.
+       Bob: Hello. How may I help you today?
+       User: Please tell me the largest city in Europe.
+       Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+     MSG
+     prompt + antiprompt
+   end
+ end
+
+ Chat.start(ARGV)
@@ -1,6 +1,7 @@
  # frozen_string_literal: true
 
  require 'mkmf'
+ require 'fileutils'
 
  abort 'libstdc++ is not found.' unless have_library('stdc++')
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
    $CFLAGS << ' -DGGML_USE_ACCELERATE'
  end
 
+ if with_config('metal')
+   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+   $CXXFLAGS << ' -DGGML_USE_METAL'
+   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
+   $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+ end
+
  if with_config('cublas')
    $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+   $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
    $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
    $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
  end
 
  if with_config('clblast')
    abort 'libclblast is not found.' unless have_library('clblast')
-   abort 'libOpenCL is not found.' unless have_library('OpenCL')
 
    $CFLAGS << ' -DGGML_USE_CLBLAST'
+   $CXXFLAGS << ' -DGGML_USE_CLBLAST'
+   if RUBY_PLATFORM.match?(/darwin/)
+     $LDFLAGS << ' -framework OpenCL'
+   else
+     abort 'libOpenCL is not found.' unless have_library('OpenCL')
+   end
  end
 
  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
      f.puts "\tnvcc -arch=native -c -o $@ $<"
    end
  end
+
+ if with_config('metal')
+   File.open('Makefile', 'a') do |f|
+     f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
+     f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
+   end
+
+   metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
+   dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
+   FileUtils.cp(metal_path, dest_path)
+ end
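The `with_config` checks above select the backend at install time. A sketch of how the flags might be passed: `--with-metal` is confirmed by the changelog, while `--with-cublas` and `--with-clblast` are assumptions inferred from the `with_config('cublas')` and `with_config('clblast')` calls (mkmf maps `--with-<name>` to `with_config('<name>')`):

```sh
# Metal (macOS); also copies ggml-metal.metal into lib/llama_cpp/
$ gem install llama_cpp -- --with-metal

# cuBLAS (assumed flag name, matching with_config('cublas') above)
$ gem install llama_cpp -- --with-cublas

# CLBlast (assumed flag name, matching with_config('clblast') above)
$ gem install llama_cpp -- --with-clblast
```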