llama_cpp 0.1.4 → 0.2.1

This diff shows the changes between publicly released versions of the package as they appear in the supported registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
-   data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
+   metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+   data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
  SHA512:
-   metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
-   data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
+   metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+   data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,39 @@
+ ## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+ - Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+ - Add `low_vram` parameter to ContextParams.
+ - Add `vocab` method to Context.
+ - Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
+ ## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
+
+ - Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
+ - Fix installation files for CUDA.
+ - Add metal config option:
+ ```
+ $ gem install llama_cpp -- --with-metal
+ ```
+ ```ruby
+ require 'llama_cpp'
+
+ params = LLaMACpp::ContextParams.new
+ params.n_gpu_layers = 1
+
+ context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
+ LLaMACpp.generate(context, 'Hello, world.')
+ ```
+
+ **Breaking Changes**
+
+ - Add ModelQuantizeParams class.
+ - Change the argument of the `model_quantize` module function in LLaMACpp.
+ ```ruby
+ require 'llama_cpp'
+
+ params = LLaMACpp::ModelQuantizeParams.new
+ LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
+ ```
+
  ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03

  - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
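
The 0.2.1 entries above add a `low_vram` option to ContextParams and a `vocab` method to Context without showing usage. A minimal sketch in the style of the 0.2.0 example, assuming `low_vram` is exposed as a plain writer (`low_vram=`) like the other ContextParams attributes; the exact signature of `Context#vocab` is not shown in this diff, so it is only referenced in a comment:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.low_vram = true  # new in 0.2.1; writer name assumed from the changelog entry
params.n_gpu_layers = 1

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
# New in 0.2.1: the context also exposes the model vocabulary via Context#vocab
# (arguments and return value are not documented in this diff).
puts LLaMACpp.generate(context, 'Hello, world.')
```
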
data/examples/README.md ADDED
@@ -0,0 +1,60 @@
+ # llama_cpp.rb/examples
+
+ ## chat.rb
+
+ ### Usage
+
+ ```sh
+ $ cd examples
+ $ gem install llama_cpp thor
+ $ ./chat.rb -m /path/to/quantized-model.bin -t 4
+ ...
+ User: Please tell me the largest city in Japan.
+ Bob: Sure. The largest city in Japan is Tokyo.
+ User:
+ ```
+
+ ### Options
+
+ ```sh
+ $ ./chat.rb help main
+ Usage:
+   chat.rb main -m, --model=MODEL
+
+ Options:
+   -s, [--seed=N] # random seed
+   # Default: -1
+   -t, [--threads=N] # number of threads
+   # Default: 2
+   -m, --model=MODEL # path to model file
+   -f, [--file=FILE] # prompt file to start generation
+   -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
+   -b, [--batch-size=N] # batch size for prompt processing
+   # Default: 1024
+   -n, [--n-predict=N] # number of tokens to predict
+   # Default: 256
+   [--keep=N] # number of tokens to keep from the initial prompt
+   # Default: 48
+   [--repeat-last-n=N] # last n tokens to consider for penalize
+   # Default: 64
+   [--repeat-penalty=N] # penalize repeat sequence of tokens
+   # Default: 1.0
+   [--presence-penalty=N] # repeat alpha presence penalty
+   # Default: 0.0
+   [--frequency-penalty=N] # repeat alpha frequency penalty
+   # Default: 0.0
+   [--top-k=N] # top k sampling
+   # Default: 40
+   [--top-p=N] # top p sampling
+   # Default: 0.95
+   [--tfs-z=N] # tail free sampling, parameter z
+   # Default: 1.0
+   [--typical-p=N] # locally typical sampling, parameter p
+   # Default: 1.0
+   [--temp=N] # temperature
+   # Default: 0.8
+   [--n-gpu-layers=N] # number of layers on GPU
+   # Default: 0
+
+ Start chat
+ ```
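
As a point of reference, the short and long options listed above can be combined in a single invocation. A hypothetical example (the model path and the specific values are placeholders, not recommendations from the release):

```sh
$ ./chat.rb -m /path/to/quantized-model.bin -t 4 -n 512 \
    --n-gpu-layers 32 --temp 0.7 --top-p 0.9 --repeat-penalty 1.1
```
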
data/examples/chat.rb ADDED
@@ -0,0 +1,195 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # chat.rb is a simple chatbot that uses llama_cpp to generate text.
+ # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
+ # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
+
+ require 'llama_cpp'
+ require 'thor'
+ require 'readline'
+
+ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
+   default_command :main
+   desc 'main', 'Start chat'
+   option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
+   option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
+   option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+   option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
+   option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
+   option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
+   option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
+   option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
+   option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
+   option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
+   option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
+   option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
+   option :top_k, type: :numeric, desc: 'top k sampling', default: 40
+   option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
+   option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
+   option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
+   option :temp, type: :numeric, desc: 'temperature', default: 0.8
+   option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+   def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+     params = LLaMACpp::ContextParams.new
+     params.seed = options[:seed]
+     params.n_gpu_layers = options[:n_gpu_layers]
+     context = LLaMACpp::Context.new(model_path: options[:model], params: params)
+
+     antiprompt = options[:reverse_prompt] || 'User:'
+     start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
+
+     embd_input = context.tokenize(text: start_prompt, add_bos: true)
+
+     n_ctx = context.n_ctx
+     raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+     n_keep = options[:keep]
+     n_keep = embd_input.size if n_keep > embd_input.size
+
+     token_newline = context.tokenize(text: "\n", add_bos: false)
+
+     last_n_tokens = [0] * n_ctx
+     interactive = true
+     is_interacting = false
+     input_echo = true
+     first_input = true
+     embd = []
+     n_consumed = 0
+     n_past = 0
+     n_remain = options[:n_predict]
+     n_vocab = context.n_vocab
+
+     while interactive
+       unless embd.empty?
+         if n_past + embd.size > n_ctx
+           n_left = n_past - n_keep
+           n_past = [1, n_keep].max
+           embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+         end
+
+         0.step(embd.size - 1, options[:batch_size]) do |i|
+           n_eval = [options[:batch_size], embd.size - i].min
+           context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+           n_past += n_eval
+         end
+       end
+
+       embd.clear
+
+       if embd_input.size <= n_consumed && !is_interacting
+         logits = context.logits
+         base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+         candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+         last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
+         context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
+         context.sample_frequency_and_presence_penalties(
+           candidates, last_n_tokens[-last_n_repeat..],
+           frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+         )
+
+         context.sample_top_k(candidates, k: options[:top_k])
+         context.sample_tail_free(candidates, z: options[:tfs_z])
+         context.sample_typical(candidates, prob: options[:typical_p])
+         context.sample_top_p(candidates, prob: options[:top_p])
+         context.sample_temperature(candidates, temperature: options[:temp])
+         id = context.sample_token(candidates)
+
+         last_n_tokens.shift
+         last_n_tokens.push(id)
+
+         if id == LLaMACpp.token_eos
+           id = token_newline.first
+           unless antiprompt.empty?
+             first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+             embd_input.concat(first_antiprompt)
+           end
+         end
+
+         embd.push(id)
+         input_echo = true
+         n_remain -= 1
+       else
+         while embd_input.size > n_consumed
+           embd.push(embd_input[n_consumed])
+           last_n_tokens.shift
+           last_n_tokens.push(embd_input[n_consumed])
+           n_consumed += 1
+           break if embd.size >= options[:batch_size]
+         end
+       end
+
+       if input_echo
+         output = []
+         embd.each { |token| output << context.token_to_str(token) }
+         output_str = output.join
+         output_str.chomp!(antiprompt) if first_input
+         print(output_str)
+       end
+
+       if embd_input.size <= n_consumed
+         if antiprompt.size.positive?
+           last_output = []
+           last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+           last_output_str = last_output.join
+
+           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
+           unless last_output_str.index(antiprompt, search_start_pos).nil?
+             is_interacting = true
+             true
+           end
+         end
+
+         if n_past.positive? && is_interacting
+           if first_input
+             print("\r#{antiprompt}")
+             first_input = false
+           end
+           buffer = Readline.readline(' ')
+           break interactive = false if buffer.nil?
+
+           if buffer.size > 1
+             line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+             embd_input.concat(line_input)
+             n_remain -= line_input.size
+           end
+
+           input_echo = false
+         end
+
+         is_interacting = false if n_past.positive?
+       end
+
+       if n_remain <= 0 && options[:n_predict] != -1
+         n_remain = options[:n_predict]
+         is_interacting = true
+       end
+     end
+   end
+
+   private
+
+   def read_prompt(filename)
+     return if filename.nil?
+
+     File.read(filename).chomp
+   end
+
+   def default_prompt(antiprompt)
+     # Reference:
+     # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
+     prompt = <<~MSG
+       Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+       User: Hello, Bob.
+       Bob: Hello. How may I help you today?
+       User: Please tell me the largest city in Europe.
+       Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+     MSG
+     prompt + antiprompt
+   end
+ end
+
+ Chat.start(ARGV)
ext/llama_cpp/extconf.rb CHANGED
@@ -1,6 +1,7 @@
  # frozen_string_literal: true

  require 'mkmf'
+ require 'fileutils'

  abort 'libstdc++ is not found.' unless have_library('stdc++')

@@ -36,17 +37,30 @@ if with_config('accelerate')
    $CFLAGS << ' -DGGML_USE_ACCELERATE'
  end

+ if with_config('metal')
+   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+   $CXXFLAGS << ' -DGGML_USE_METAL'
+   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
+   $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+ end
+
  if with_config('cublas')
    $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+   $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
    $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
    $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
  end

  if with_config('clblast')
    abort 'libclblast is not found.' unless have_library('clblast')
-   abort 'libOpenCL is not found.' unless have_library('OpenCL')

    $CFLAGS << ' -DGGML_USE_CLBLAST'
+   $CXXFLAGS << ' -DGGML_USE_CLBLAST'
+   if RUBY_PLATFORM.match?(/darwin/)
+     $LDFLAGS << ' -framework OpenCL'
+   else
+     abort 'libOpenCL is not found.' unless have_library('OpenCL')
+   end
  end

  UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
      f.puts "\tnvcc -arch=native -c -o $@ $<"
    end
  end
+
+ if with_config('metal')
+   File.open('Makefile', 'a') do |f|
+     f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
+     f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
+   end
+
+   metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
+   dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
+   FileUtils.cp(metal_path, dest_path)
+ end
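
The extconf.rb changes above add a Metal build path and extend the cuBLAS and CLBlast configuration. Only the `--with-metal` flag is documented in the 0.2.0 notes; assuming the other `with_config` checks follow the same mkmf `--with-<name>` convention, the backends would be selected at install time roughly like this:

```sh
$ gem install llama_cpp -- --with-metal     # documented in the 0.2.0 changelog entry above
$ gem install llama_cpp -- --with-cublas    # inferred from with_config('cublas')
$ gem install llama_cpp -- --with-clblast   # inferred from with_config('clblast')
```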