llama_cpp 0.5.3 → 0.6.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 854493444a65cd1239649b991c8e6538c542c02a052932f6a69c56c984e28f58
+  data.tar.gz: 4e0b70de25eb2661b693af0d488efd25f570c3f62d4b9044fdd5c14fb5b9fac6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b2524b8eb6e8568116f3c33eb57b764044083ee2ff2bbb7f15fc6301b024197ea8fca75968535b302a9e70449c9f9f28e0760cf4bfefb00ed8137c18e84137d5
+  data.tar.gz: faf26b552a8a862a97129b5bd25e05b3ae3edd2f8b118622b119634e4b004c05d200653c40085e4a28243c8994c517699baa35d3a8096ad8ac598fd637cf0565
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## [[0.6.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.3...v0.6.0)] - 2023-09-30
+
+**Breaking Changes**
+- Bump bundled llama.cpp from b1266 to b1292.
+- There are many API changes, so please refer to the commits.
+
+It is becoming difficult to keep up with major changes in llama.cpp,
+and I may give up on developing this gem in the future to prioritize my own life.
+
 ## [[0.5.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.2...v0.5.3)] - 2023-09-23
 
 - Bump bundled llama.cpp from b1 to b1266.
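Since the changelog only points at the commits, here is a minimal sketch of the new 0.6.0 initialization flow, pieced together from the README and example diffs that follow; the model path and the n_gpu_layers value are placeholders, not values taken from this release.

```ruby
require 'llama_cpp'

# Model-level settings (e.g. GPU offload) now go through ModelParams.
model_params = LLaMACpp::ModelParams.new
model_params.n_gpu_layers = 0 # placeholder: CPU-only

model = LLaMACpp::Model.new(model_path: '/path/to/ggml-model-q4_0.bin', params: model_params)

# Context-level settings (seed, embedding mode, ...) go through ContextParams.
context_params = LLaMACpp::ContextParams.new
context_params.seed = 42

context = LLaMACpp::Context.new(model: model, params: context_params)

# Tokenization is reached through the model attached to the context.
tokens = context.model.tokenize(text: 'Hello, World.', add_bos: true)

puts LLaMACpp.generate(context, 'Hello, World.')
```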
data/README.md
CHANGED
@@ -59,13 +59,14 @@ An example of Ruby code that generates sentences with the quantization model is
 ```ruby
 require 'llama_cpp'
 
-
-
+model_params = LLaMACpp::ModelParams.new
+model = LLaMACpp::Model.new(model_path: '/home/user/llama.cpp/models/open_llama_7b/ggml-model-q4_0.bin', params: model_params)
 
-
-
+context_params = LLaMACpp::ContextParams.new
+context_params.seed = 42
+context = LLaMACpp::Context.new(model: model, params: context_params)
 
-puts LLaMACpp.generate(context, 'Hello, World.'
+puts LLaMACpp.generate(context, 'Hello, World.')
 ```
 
 ## Examples
data/examples/chat.rb
CHANGED
@@ -14,7 +14,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 default_command :main
 desc 'main', 'Start chat'
 option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
 option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
@@ -32,16 +31,17 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 option :temp, type: :numeric, desc: 'temperature', default: 0.8
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
 def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
-
-
-
-
-
+mdl_params = LLaMACpp::ModelParams.new
+mdl_params.n_gpu_layers = options[:n_gpu_layers]
+model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
+ctx_params = LLaMACpp::ContextParams.new
+ctx_params.seed = options[:seed] if options[:seed] != -1
+context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
 antiprompt = options[:reverse_prompt] || 'User:'
 start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
 
-embd_input = context.tokenize(text: start_prompt, add_bos: true)
+embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
 
 n_ctx = context.n_ctx
 raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
@@ -58,7 +58,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 n_consumed = 0
 n_past = 0
 n_remain = options[:n_predict]
-n_vocab = context.n_vocab
+n_vocab = context.model.n_vocab
 
 while interactive
 unless embd.empty?
@@ -70,7 +70,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 0.step(embd.size - 1, options[:batch_size]) do |i|
 n_eval = [options[:batch_size], embd.size - i].min
-context.eval(tokens: embd[i...i + n_eval], n_past: n_past
+context.eval(tokens: embd[i...i + n_eval], n_past: n_past)
 n_past += n_eval
 end
 end
@@ -102,7 +102,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 if id == context.token_eos
 id = context.token_nl
 unless antiprompt.empty?
-first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
+first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
 embd_input.concat(first_antiprompt)
 end
 end
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 if input_echo
 output = []
-embd.each { |token| output << context.token_to_piece(token) }
+embd.each { |token| output << context.model.token_to_piece(token) }
 output_str = output.join
 output_str.chomp!(antiprompt) if first_input
 print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 if embd_input.size <= n_consumed
 if antiprompt.size.positive?
 last_output = []
-last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
+last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
 last_output_str = last_output.join
 
 search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -150,7 +150,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 break interactive = false if buffer.nil?
 
 if buffer.size > 1
-line_input = context.tokenize(text: "#{buffer}\n", add_bos: false)
+line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
 embd_input.concat(line_input)
 n_remain -= line_input.size
 end
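The recurring change in chat.rb is that tokenization and vocabulary lookups move from the context to its model, while evaluation stays on the context. A condensed sketch of that loop, assuming a `context` built as in the changelog sketch above and a placeholder batch size:

```ruby
embd_input = context.model.tokenize(text: 'User: Hello', add_bos: true)

n_past = 0
batch_size = 512 # placeholder; chat.rb takes this from options[:batch_size]
0.step(embd_input.size - 1, batch_size) do |i|
  # Feed the prompt in batches, advancing n_past by the number of tokens evaluated.
  n_eval = [batch_size, embd_input.size - i].min
  context.eval(tokens: embd_input[i...i + n_eval], n_past: n_past)
  n_past += n_eval
end

# Converting tokens back to text also goes through the model now.
text = embd_input.map { |token| context.model.token_to_piece(token) }.join
```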
data/examples/embedding.rb
CHANGED
@@ -12,23 +12,23 @@ class Embedding < Thor # rubocop:disable Style/Documentation
 default_command :main
 desc 'main', 'Extract embedding from prompt'
 option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-option :threads, type: :numeric, aliases: '-t', desc: 'number of threads', default: 2
 option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
 option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
 option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
 def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-
-
-
-
-
-
+mdl_params = LLaMACpp::ModelParams.new
+mdl_params.n_gpu_layers = options[:n_gpu_layers]
+model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
+ctx_params = LLaMACpp::ContextParams.new
+ctx_params.embedding = true
+ctx_params.seed = options[:seed] if options[:seed] != -1
+context = LLaMACpp::Context.new(model: model, params: ctx_params)
 
-embd_input = context.tokenize(text: options[:prompt], add_bos: true)
+embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
 
 return unless embd_input.size.positive?
 
-context.eval(tokens: embd_input, n_past: 0
+context.eval(tokens: embd_input, n_past: 0)
 
 context.embeddings.each { |val| print("#{val} ") }
 print("\n")
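For the embedding path, the one extra step over the chat flow is enabling embedding mode on the context parameters before evaluation. A minimal sketch under the same assumptions (placeholder model path and prompt):

```ruby
mdl_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/ggml-model-q4_0.bin', params: mdl_params)

ctx_params = LLaMACpp::ContextParams.new
ctx_params.embedding = true # required so the context exposes embeddings
context = LLaMACpp::Context.new(model: model, params: ctx_params)

tokens = context.model.tokenize(text: 'Hello, World.', add_bos: true)
context.eval(tokens: tokens, n_past: 0)

# Print one float per embedding dimension, as examples/embedding.rb does.
context.embeddings.each { |val| print("#{val} ") }
print("\n")
```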