llama_cpp 0.17.10 → 0.18.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -31
- data/ext/llama_cpp/extconf.rb +0 -3
- data/ext/llama_cpp/llama_cpp.c +5174 -0
- data/ext/llama_cpp/llama_cpp.h +0 -5
- data/lib/llama_cpp/version.rb +3 -3
- data/lib/llama_cpp.rb +38 -83
- metadata +4 -13
- data/examples/README.md +0 -92
- data/examples/chat.rb +0 -198
- data/examples/embedding.rb +0 -42
- data/examples/prompt_jp.txt +0 -8
- data/examples/simple.rb +0 -96
- data/ext/llama_cpp/llama_cpp.cpp +0 -3764
- data/sig/llama_cpp.rbs +0 -425
data/ext/llama_cpp/llama_cpp.h CHANGED
data/lib/llama_cpp/version.rb CHANGED
@@ -1,10 +1,10 @@
 # frozen_string_literal: true

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.17.10'
+  VERSION = '0.18.1'

   # The supported version of llama.cpp.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b4713'
 end
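Taken together with the `LLaMACpp = LlamaCpp` alias added in data/lib/llama_cpp.rb below, the rename keeps code written against the old constant working. A minimal check of the renamed module and version constants (not part of the diff; it assumes the 0.18.1 gem is installed):

```ruby
require 'llama_cpp'

puts LlamaCpp::VERSION            # => "0.18.1"
puts LlamaCpp::LLAMA_CPP_VERSION  # => "b4713" (the pinned llama.cpp build)
puts LLaMACpp.equal?(LlamaCpp)    # => true, via the backward-compatibility alias below
```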
data/lib/llama_cpp.rb CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   module_function

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [LLaMACpp::Context] The context to use.
+  # @param context [LlamaCpp::LlamaContext] The context to use.
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
-  # @param n_keep [Integer] The number of tokens to keep in the context.
-  # @param n_batch [Integer] The number of tokens to process in a batch.
-  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
-  # @param repeat_penalty [Float] The repetition penalty.
-  # @param frequency [Float] The frequency penalty.
-  # @param presence [Float] The presence penalty.
-  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
-  # @param top_p [Float] The probability threshold for nucleus sampling.
-  # @param tfs_z [Float] The z parameter for tail-free sampling.
-  # @param typical_p [Float] The probability for typical sampling.
-  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/
-
-               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
-               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
-    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+  def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+    raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

-
-
+    model = LlamaCpp.llama_get_model(context)
+    vocab = LlamaCpp.llama_model_get_vocab(model)

-
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+    n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

-
+    prompt_tokens = []
+    raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+                                                                     true).negative?

-
-
-
-
-
+    ctx_params = LlamaCpp::LlamaContextParams.new
+    ctx_params.n_ctx = n_prompt + n_predict - 1
+    ctx_params.n_batch = n_prompt
+    ctx_params.no_perf = false
+
+    ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+    sparams = LlamaCpp::LlamaSamplerChainParams.new
+    sparams.no_perf = false
+    smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+    LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+    batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+    n_pos = 0
     output = []
+    while n_pos + batch.n_tokens < n_prompt + n_predict
+      break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+      n_pos += batch.n_tokens
+
+      new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+      break if llama_vocab_is_eog?(vocab, new_token_id)
+
+      buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+      output << buf

-
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = n_keep
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
-      end
-
-      n_past += embd.size
-      embd.clear
-
-      if embd_input.size <= n_consumed
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        # apply penalties
-        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-        context.sample_repetition_penalties(
-          candidates, last_n_tokens[-last_n_repeat..],
-          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
-        )
-
-        # temperature sampling
-        context.sample_top_k(candidates, k: top_k)
-        context.sample_tail_free(candidates, z: tfs_z)
-        context.sample_typical(candidates, prob: typical_p)
-        context.sample_top_p(candidates, prob: top_p)
-        context.sample_temp(candidates, temp: temperature)
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        embd.push(id)
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= n_batch
-        end
-      end
-
-      embd.each { |token| output << context.model.token_to_piece(token) }
-
-      break if !embd.empty? && embd[-1] == context.model.token_eos
+      batch = LlamaCpp.llama_batch_get_one([new_token_id])
     end

-    output.join
+    output.join
   end
 end

-LLaMACpp
-at_exit { LLaMACpp.backend_free }
+LLaMACpp = LlamaCpp
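The rewritten `generate` above drives the low-level llama.cpp C API directly (tokenize, decode, greedy sampler chain) instead of the old high-level wrapper classes. A minimal usage sketch, not taken from this diff: it assumes the bindings also expose `LlamaCpp::LlamaModelParams` and `LlamaCpp.llama_model_load_from_file`, mirroring the llama.cpp C API; only `llama_init_from_model` and `LlamaContextParams` are confirmed by the diff itself, so check the gem's generated docs for the exact names.

```ruby
require 'llama_cpp'

# Assumed wrappers mirroring the llama.cpp C API (not shown in this diff).
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

# LlamaContextParams and llama_init_from_model appear in the diff above.
context_params = LlamaCpp::LlamaContextParams.new
context = LlamaCpp.llama_init_from_model(model, context_params)

# generate derives the model from `context` and builds its own context and sampler internally.
puts LlamaCpp.generate(context, 'Hello my name is', n_predict: 32)
```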
metadata CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.17.10
+  version: 0.18.1
 platform: ruby
 authors:
 - yoshoku
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2025-02-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,17 +21,11 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
-- examples/README.md
-- examples/chat.rb
-- examples/embedding.rb
-- examples/prompt_jp.txt
-- examples/simple.rb
 - ext/llama_cpp/extconf.rb
-- ext/llama_cpp/llama_cpp.cpp
+- ext/llama_cpp/llama_cpp.c
 - ext/llama_cpp/llama_cpp.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
-- sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT
@@ -42,7 +35,6 @@ metadata:
   changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
   documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
   rubygems_mfa_required: 'true'
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -57,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.
 test_files: []
data/examples/README.md DELETED
@@ -1,92 +0,0 @@
-# llama_cpp.rb/examples
-
-## chat.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./chat.rb -m /path/to/quantized-model.bin -t 4
-...
-User: Please tell me the largest city in Japan.
-Bob: Sure. The largest city in Japan is Tokyo.
-User:
-```
-
-### Options
-
-```sh
-$ ./chat.rb help main
-Usage:
-  chat.rb main -m, --model=MODEL
-
-Options:
-  -s, [--seed=N]                         # random seed
-                                         # Default: -1
-  -t, [--threads=N]                      # number of threads
-                                         # Default: 2
-  -m, --model=MODEL                      # path to model file
-  -f, [--file=FILE]                      # prompt file to start generation
-  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
-  -b, [--batch-size=N]                   # batch size for prompt processing
-                                         # Default: 1024
-  -n, [--n-predict=N]                    # number of tokens to predict
-                                         # Default: 256
-      [--keep=N]                         # number of tokens to keep from the initial prompt
-                                         # Default: 48
-      [--repeat-last-n=N]                # last n tokens to consider for penalize
-                                         # Default: 64
-      [--repeat-penalty=N]               # penalize repeat sequence of tokens
-                                         # Default: 1.0
-      [--presence-penalty=N]             # repeat alpha presence penalty
-                                         # Default: 0.0
-      [--frequency-penalty=N]            # repeat alpha frequency penalty
-                                         # Default: 0.0
-      [--top-k=N]                        # top k sampling
-                                         # Default: 40
-      [--top-p=N]                        # top p sampling
-                                         # Default: 0.95
-      [--tfs-z=N]                        # tail free sampling, parameter z
-                                         # Default: 1.0
-      [--typical-p=N]                    # locally typical sampling, parameter p
-                                         # Default: 1.0
-      [--temp=N]                         # temperature
-                                         # Default: 0.8
-      [--n-gpu-layers=N]                 # number of layers on GPU
-                                         # Default: 0
-
-Start chat
-```
-
-## embedding.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
-...
-0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
-```
-
-### Options
-
-```
-$ ./embedding.rb help main
-Usage:
-  embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
-Options:
-  -s, [--seed=N]          # random seed
-                          # Default: -1
-  -t, [--threads=N]       # number of threads
-                          # Default: 2
-  -m, --model=MODEL       # path to model file
-  -p, --prompt=PROMPT     # prompt to generate embedding
-      [--n-gpu-layers=N]  # number of layers on GPU
-                          # Default: 0
-
-Extract embedding from prompt
-```
data/examples/chat.rb DELETED
@@ -1,198 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# chat.rb is a simple chatbot that uses llama_cpp to generate text.
-# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
-require 'llama_cpp'
-require 'thor'
-require 'readline'
-require 'etc'
-
-class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
-  default_command :main
-  desc 'main', 'Start chat'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
-  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
-  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
-  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
-  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
-  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
-  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
-  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
-  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
-  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
-  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
-  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
-  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
-  option :temp, type: :numeric, desc: 'temperature', default: 0.8
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    antiprompt = options[:reverse_prompt] || 'User:'
-    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
-    embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
-    n_ctx = context.n_ctx
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-    n_keep = options[:keep]
-    n_keep = embd_input.size if n_keep > embd_input.size
-
-    last_n_tokens = [0] * n_ctx
-    interactive = true
-    is_interacting = false
-    input_echo = true
-    first_input = true
-    embd = []
-    n_consumed = 0
-    n_past = 0
-    n_remain = options[:n_predict]
-    n_vocab = context.model.n_vocab
-
-    while interactive
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = [1, n_keep].max
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        0.step(embd.size - 1, options[:batch_size]) do |i|
-          n_eval = [options[:batch_size], embd.size - i].min
-          context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
-          n_past += n_eval
-        end
-      end
-
-      embd.clear
-
-      if embd_input.size <= n_consumed && !is_interacting
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
-        context.sample_repetition_penalties(
-          candidates,
-          last_n_tokens[-last_n_repeat..],
-          penalty_repeat: options[:repeat_penalty],
-          penalty_freq: options[:frequency_penalty],
-          penalty_present: options[:presence_penalty]
-        )
-
-        context.sample_top_k(candidates, k: options[:top_k])
-        context.sample_tail_free(candidates, z: options[:tfs_z])
-        context.sample_typical(candidates, prob: options[:typical_p])
-        context.sample_top_p(candidates, prob: options[:top_p])
-        context.sample_temp(candidates, temp: options[:temp])
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        if id == context.model.token_eos
-          id = context.model.token_nl
-          unless antiprompt.empty?
-            first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
-            embd_input.concat(first_antiprompt)
-          end
-        end
-
-        embd.push(id)
-        input_echo = true
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= options[:batch_size]
-        end
-      end
-
-      if input_echo
-        output = embd.map { |token| context.model.token_to_piece(token) }
-        output_str = output.join
-        output_str.chomp!(antiprompt) if first_input
-        print(output_str)
-      end
-
-      if embd_input.size <= n_consumed
-        if antiprompt.size.positive?
-          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
-          last_output_str = last_output.join
-
-          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
-          unless last_output_str.index(antiprompt, search_start_pos).nil?
-            is_interacting = true
-            true
-          end
-        end
-
-        if n_past.positive? && is_interacting
-          if first_input
-            print("\r#{antiprompt}")
-            first_input = false
-          end
-          buffer = Readline.readline(' ')
-          break interactive = false if buffer.nil?
-
-          if buffer.size > 1
-            line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
-            embd_input.concat(line_input)
-            n_remain -= line_input.size
-          end
-
-          input_echo = false
-        end
-
-        is_interacting = false if n_past.positive?
-      end
-
-      if n_remain <= 0 && options[:n_predict] != -1
-        n_remain = options[:n_predict]
-        is_interacting = true
-      end
-    end
-  end
-
-  private
-
-  def read_prompt(filename)
-    return if filename.nil?
-
-    File.read(filename).chomp
-  end
-
-  def default_prompt(antiprompt)
-    # Reference:
-    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
-    prompt = <<~MSG
-      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-      User: Hello, Bob.
-      Bob: Hello. How may I help you today?
-      User: Please tell me the largest city in Europe.
-      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-    MSG
-    prompt + antiprompt
-  end
-end
-
-Chat.start(ARGV)
data/examples/embedding.rb DELETED
@@ -1,42 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# embedding.rb extracts embedding from prompt.
-# It is created with reference to embedding.cpp in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Embedding < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Extract embedding from prompt'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.embedding = true
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
-    return unless embd_input.size.positive?
-
-    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
-    context.embeddings.each { |val| print("#{val} ") }
-    print("\n")
-  end
-end
-
-Embedding.start(ARGV)
data/examples/prompt_jp.txt DELETED
data/examples/simple.rb DELETED
@@ -1,96 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# simple.rb is a simple text completion script.
-# It is created with reference to simple.cpp in llama.cpp examples:
-# https://github.com/ggerganov/llama.cpp/blob/master/examples/simple/simple.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Simple < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Simple completion'
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    n_len = 32
-    model_params = LLaMACpp::ModelParams.new
-    model = LLaMACpp::Model.new(model_path: options[:model], params: model_params)
-    context_params = LLaMACpp::ContextParams.new
-    context_params.seed = 1234
-    context_params.n_ctx = 2048
-    context_params.logits_all = true
-    context_params.n_threads = options[:n_threads]
-    context_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: context_params)
-
-    tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
-    n_ctx = context.n_ctx
-    n_kv_req = tokens_list.size + (n_len - tokens_list.size)
-    raise 'n_kv_req > n_ctx, the required KV cache size is not big enough' if n_kv_req > n_ctx
-
-    print("\nmain: n_len = #{n_len}, n_ctx = #{n_ctx}, n_kv_req = #{n_kv_req}\n\n")
-
-    tokens_list.each { |token| print(context.model.token_to_piece(token)) }
-
-    batch = LLaMACpp::Batch.new(max_n_token: 512, n_embd: 0, max_n_seq: 1)
-    tokens_list.each_with_index do |token, id|
-      batch.set_token(batch.n_tokens, token)
-      batch.set_pos(batch.n_tokens, id)
-      batch.set_n_seq_id(batch.n_tokens, 1)
-      batch.set_seq_id(batch.n_tokens, 0, 0)
-      batch.set_logits(batch.n_tokens, false)
-      batch.n_tokens = batch.n_tokens + 1
-    end
-
-    batch.set_logits(batch.n_tokens - 1, true)
-
-    context.decode(batch)
-
-    n_cur = batch.n_tokens
-    n_decode = 0
-    n_vocab = context.model.n_vocab
-
-    t_start = Time.now
-
-    while n_cur <= n_len
-      logits = context.logits[((batch.n_tokens - 1) * n_vocab)..]
-
-      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i] || 0.0, p: 0.0) }
-      candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-      new_token_id = context.sample_token_greedy(candidates)
-
-      if new_token_id == context.model.token_eos || n_cur == n_len
-        print("\n")
-        break
-      end
-
-      print(context.model.token_to_piece(new_token_id))
-
-      batch.n_tokens = 0
-
-      batch.set_token(batch.n_tokens, new_token_id)
-      batch.set_pos(batch.n_tokens, n_cur)
-      batch.set_n_seq_id(batch.n_tokens, 1)
-      batch.set_seq_id(batch.n_tokens, 0, 0)
-      batch.set_logits(batch.n_tokens, true)
-      batch.n_tokens = batch.n_tokens + 1
-
-      n_decode += 1
-      n_cur += 1
-      context.decode(batch)
-    end
-
-    t_end = Time.now
-
-    print("\nmain: decoded #{n_decode} tokens in #{(t_end - t_start).floor(2)} s, speed: #{n_decode.fdiv(t_end - t_start).floor(2)} t/s\n\n")
-
-    LLaMACpp.backend_free
-  end
-end
-
-Simple.start(ARGV)