llama_cpp 0.1.4 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ad6a2964cfc46e940026d76a3d340509ba8c30fdaf3902730081f44b7b40cfde
+  data.tar.gz: 48384234163db26b7ee45d12310ba09b1a8f4f37906ede2f9d89eb72f05df665
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 132095fecc385ca629dc051d27bafddccf893def0702690abcaf7c3b87900c643ff301bf5f3f27db99a5c58ecb90385210e35c935cf2bd99f00b2675374b31c8
+  data.tar.gz: 5987962a6d84cdf7e7a171be41e7df96a0dab94d54f408df20303d4d1622ea851c6367d9773d4d985eaa1ba77f804ab730580a1a0a4374e96b5153c1a2471ed1
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,39 @@
+## [[0.2.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.0...v0.2.1)] - 2023-06-17
+
+- Bump bundled llama.cpp from master-4de0334 to master-a09f919.
+- Add `low_vram` parameter to ContextParams.
+- Add `vocab` method to Context.
+- Add example script: https://github.com/yoshoku/llama_cpp.rb/tree/main/examples
+
+## [[0.2.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.4...v0.2.0)] - 2023-06-11
+
+- Bump bundled llama.cpp from master-ffb06a3 to master-4de0334.
+- Fix installation files for CUDA.
+- Add metal config option:
+```
+$ gem install llama_cpp -- --with-metal
+```
+```ruby
+require 'llama_cpp'
+
+params = LLaMACpp::ContextParams.new
+params.n_gpu_layers = 1
+
+context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
+LLaMACpp.generate(context, 'Hello, world.')
+```
+
+**Breaking Changes**
+
+- Add ModelQuantizationParams class.
+- Change the argument of the `model_quantize` module function in LLaMACpp.
+```ruby
+require 'llama_cpp'
+
+params = LLaMACpp::ModelQuantizeParams.new
+LLaMACpp.model_quantize(input_path: 'foo.model', output_path: 'bar.model', params: params)
+```
+
 ## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
 
 - Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
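For reference, a minimal sketch of the new `low_vram` setting from the 0.2.1 entry above. It assumes the `low_vram=` setter follows the same `ContextParams` accessor pattern as the `n_gpu_layers=` line in the 0.2.0 example; the exact signature of the new `vocab` method is not shown in this diff.

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_gpu_layers = 1
params.low_vram = true # assumed boolean accessor added in 0.2.1, mirroring upstream llama.cpp's low_vram flag

context = LLaMACpp::Context.new(model_path: '/path/to/quantized-model.bin', params: params)
LLaMACpp.generate(context, 'Hello, world.')
```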
data/examples/README.md
ADDED
@@ -0,0 +1,60 @@
+# llama_cpp.rb/examples
+
+## chat.rb
+
+### Usage
+
+```sh
+$ cd examples
+$ gem install llama_cpp thor
+$ ./chat.rb -m /path/to/quantized-model.bin -t 4
+...
+User: Please tell me the largest city in Japan.
+Bob: Sure. The largest city in Japan is Tokyo.
+User:
+```
+
+### Options
+
+```sh
+$ ./chat.rb help main
+Usage:
+  chat.rb main -m, --model=MODEL
+
+Options:
+  -s, [--seed=N]                         # random seed
+                                         # Default: -1
+  -t, [--threads=N]                      # number of threads
+                                         # Default: 2
+  -m, --model=MODEL                      # path to model file
+  -f, [--file=FILE]                      # prompt file to start generation
+  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
+  -b, [--batch-size=N]                   # batch size for prompt processing
+                                         # Default: 1024
+  -n, [--n-predict=N]                    # number of tokens to predict
+                                         # Default: 256
+      [--keep=N]                         # number of tokens to keep from the initial prompt
+                                         # Default: 48
+      [--repeat-last-n=N]                # last n tokens to consider for penalize
+                                         # Default: 64
+      [--repeat-penalty=N]               # penalize repeat sequence of tokens
+                                         # Default: 1.0
+      [--presence-penalty=N]             # repeat alpha presence penalty
+                                         # Default: 0.0
+      [--frequency-penalty=N]            # repeat alpha frequency penalty
+                                         # Default: 0.0
+      [--top-k=N]                        # top k sampling
+                                         # Default: 40
+      [--top-p=N]                        # top p sampling
+                                         # Default: 0.95
+      [--tfs-z=N]                        # tail free sampling, parameter z
+                                         # Default: 1.0
+      [--typical-p=N]                    # locally typical sampling, parameter p
+                                         # Default: 1.0
+      [--temp=N]                         # temperature
+                                         # Default: 0.8
+      [--n-gpu-layers=N]                 # number of layers on GPU
+                                         # Default: 0
+
+Start chat
+```
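To connect these CLI options to the library API, here is a condensed sketch drawn from the chat.rb source shown below; the model path and option values are hypothetical placeholders.

```ruby
require 'llama_cpp'

# Hypothetical stand-ins for the CLI options listed above.
model_path   = '/path/to/quantized-model.bin' # -m / --model
seed         = -1                             # -s / --seed
threads      = 4                              # -t / --threads
n_gpu_layers = 0                              # --n-gpu-layers

params = LLaMACpp::ContextParams.new
params.seed = seed
params.n_gpu_layers = n_gpu_layers
context = LLaMACpp::Context.new(model_path: model_path, params: params)

# Tokenize a prompt and evaluate it with the requested thread count,
# as chat.rb does for each batch.
tokens = context.tokenize(text: 'User: Hello, Bob.', add_bos: true)
context.eval(tokens: tokens, n_past: 0, n_threads: threads)
```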
data/examples/chat.rb
ADDED
@@ -0,0 +1,195 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# chat.rb is a simple chatbot that uses llama_cpp to generate text.
+# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
+# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
+# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
+
+require 'llama_cpp'
+require 'thor'
+require 'readline'
+
+class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
+  default_command :main
+  desc 'main', 'Start chat'
+  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
+  option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
+  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
+  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
+  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
+  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
+  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
+  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
+  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
+  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
+  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
+  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
+  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
+  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
+  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
+  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
+  option :temp, type: :numeric, desc: 'temperature', default: 0.8
+  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
+  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+    params = LLaMACpp::ContextParams.new
+    params.seed = options[:seed]
+    params.n_gpu_layers = options[:n_gpu_layers]
+    context = LLaMACpp::Context.new(model_path: options[:model], params: params)
+
+    antiprompt = options[:reverse_prompt] || 'User:'
+    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
+
+    embd_input = context.tokenize(text: start_prompt, add_bos: true)
+
+    n_ctx = context.n_ctx
+    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+    n_keep = options[:keep]
+    n_keep = embd_input.size if n_keep > embd_input.size
+
+    token_newline = context.tokenize(text: "\n", add_bos: false)
+
+    last_n_tokens = [0] * n_ctx
+    interactive = true
+    is_interacting = false
+    input_echo = true
+    first_input = true
+    embd = []
+    n_consumed = 0
+    n_past = 0
+    n_remain = options[:n_predict]
+    n_vocab = context.n_vocab
+
+    while interactive
+      unless embd.empty?
+        if n_past + embd.size > n_ctx
+          n_left = n_past - n_keep
+          n_past = [1, n_keep].max
+          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+        end
+
+        0.step(embd.size - 1, options[:batch_size]) do |i|
+          n_eval = [options[:batch_size], embd.size - i].min
+          context.eval(tokens: embd[i...i + n_eval], n_past: n_past, n_threads: options[:threads])
+          n_past += n_eval
+        end
+      end
+
+      embd.clear
+
+      if embd_input.size <= n_consumed && !is_interacting
+        logits = context.logits
+        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
+        context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: options[:repeat_penalty])
+        context.sample_frequency_and_presence_penalties(
+          candidates, last_n_tokens[-last_n_repeat..],
+          frequency: options[:frequency_penalty], presence: options[:presence_penalty]
+        )
+
+        context.sample_top_k(candidates, k: options[:top_k])
+        context.sample_tail_free(candidates, z: options[:tfs_z])
+        context.sample_typical(candidates, prob: options[:typical_p])
+        context.sample_top_p(candidates, prob: options[:top_p])
+        context.sample_temperature(candidates, temperature: options[:temp])
+        id = context.sample_token(candidates)
+
+        last_n_tokens.shift
+        last_n_tokens.push(id)
+
+        if id == LLaMACpp.token_eos
+          id = token_newline.first
+          unless antiprompt.empty?
+            first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+            embd_input.concat(first_antiprompt)
+          end
+        end
+
+        embd.push(id)
+        input_echo = true
+        n_remain -= 1
+      else
+        while embd_input.size > n_consumed
+          embd.push(embd_input[n_consumed])
+          last_n_tokens.shift
+          last_n_tokens.push(embd_input[n_consumed])
+          n_consumed += 1
+          break if embd.size >= options[:batch_size]
+        end
+      end
+
+      if input_echo
+        output = []
+        embd.each { |token| output << context.token_to_str(token) }
+        output_str = output.join
+        output_str.chomp!(antiprompt) if first_input
+        print(output_str)
+      end
+
+      if embd_input.size <= n_consumed
+        if antiprompt.size.positive?
+          last_output = []
+          last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+          last_output_str = last_output.join
+
+          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
+          unless last_output_str.index(antiprompt, search_start_pos).nil?
+            is_interacting = true
+            true
+          end
+        end
+
+        if n_past.positive? && is_interacting
+          if first_input
+            print("\r#{antiprompt}")
+            first_input = false
+          end
+          buffer = Readline.readline(' ')
+          break interactive = false if buffer.nil?
+
+          if buffer.size > 1
+            line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+            embd_input.concat(line_input)
+            n_remain -= line_input.size
+          end
+
+          input_echo = false
+        end
+
+        is_interacting = false if n_past.positive?
+      end
+
+      if n_remain <= 0 && options[:n_predict] != -1
+        n_remain = options[:n_predict]
+        is_interacting = true
+      end
+    end
+  end
+
+  private
+
+  def read_prompt(filename)
+    return if filename.nil?
+
+    File.read(filename).chomp
+  end
+
+  def default_prompt(antiprompt)
+    # Reference:
+    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
+    prompt = <<~MSG
+      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+      User: Hello, Bob.
+      Bob: Hello. How may I help you today?
+      User: Please tell me the largest city in Europe.
+      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+    MSG
+    prompt + antiprompt
+  end
+end
+
+Chat.start(ARGV)
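The context-window handling at the top of the main loop above is terse, so here is a standalone sketch of the same arithmetic with hypothetical sizes; it prepends the recycled slice by concatenation rather than `insert` purely for readability.

```ruby
# Sketch of the context-window recycling used in chat.rb above (hypothetical sizes).
n_ctx  = 512                               # model context size
n_keep = 48                                # tokens preserved from the initial prompt
n_past = 510                               # tokens already evaluated
embd   = [7, 8, 9]                         # pending token ids awaiting evaluation
last_n_tokens = Array.new(n_ctx) { |i| i } # ring buffer of the most recent token ids

if n_past + embd.size > n_ctx
  n_left = n_past - n_keep # history beyond the kept prompt (462 here)
  n_past = [1, n_keep].max # resume evaluation right after the kept prompt
  # Recycle the last n_left / 2 history tokens (excluding the pending ones, which
  # occupy the tail of last_n_tokens) so generation keeps some recent context.
  recycled = last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size]
  embd = recycled + embd
end
```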
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require 'mkmf'
+require 'fileutils'
 
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 
@@ -36,17 +37,30 @@ if with_config('accelerate')
   $CFLAGS << ' -DGGML_USE_ACCELERATE'
 end
 
+if with_config('metal')
+  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+  $CXXFLAGS << ' -DGGML_USE_METAL'
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders'
+  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+end
+
 if with_config('cublas')
   $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
+  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
   $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
   $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
 end
 
 if with_config('clblast')
   abort 'libclblast is not found.' unless have_library('clblast')
-  abort 'libOpenCL is not found.' unless have_library('OpenCL')
 
   $CFLAGS << ' -DGGML_USE_CLBLAST'
+  $CXXFLAGS << ' -DGGML_USE_CLBLAST'
+  if RUBY_PLATFORM.match?(/darwin/)
+    $LDFLAGS << ' -framework OpenCL'
+  else
+    abort 'libOpenCL is not found.' unless have_library('OpenCL')
+  end
 end
 
 UNAME_M = RbConfig::CONFIG['build_cpu'] || RbConfig::CONFIG['host_cpu'] || RbConfig::CONFIG['target_cpu']
@@ -78,3 +92,14 @@ if with_config('cublas')
     f.puts "\tnvcc -arch=native -c -o $@ $<"
   end
 end
+
+if with_config('metal')
+  File.open('Makefile', 'a') do |f|
+    f.puts 'ggml-metal.o: ggml-metal.m ggml-metal.h'
+    f.puts "\t$(CC) $(CFLAGS) -c $< -o $@"
+  end
+
+  metal_path = File.expand_path("#{__dir__}/src/ggml-metal.metal")
+  dest_path = File.expand_path("#{__dir__}/../../lib/llama_cpp/")
+  FileUtils.cp(metal_path, dest_path)
+end
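For context on the `with_config` branches above, a minimal, hypothetical extconf.rb sketch (the extension name `example` is made up) showing how a flag passed after `--` at install time, e.g. `gem install example -- --with-metal`, is picked up by mkmf:

```ruby
# Hypothetical minimal extconf.rb illustrating the with_config pattern used above.
# `gem install example -- --with-metal` makes with_config('metal') return true.
require 'mkmf'

if with_config('metal')
  $CFLAGS  << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
end

create_makefile('example/example')
```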