llama_cpp 0.17.10 → 0.18.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -29
- data/ext/llama_cpp/extconf.rb +0 -3
- data/ext/llama_cpp/llama_cpp.c +5157 -0
- data/ext/llama_cpp/llama_cpp.h +0 -5
- data/lib/llama_cpp/version.rb +3 -3
- data/lib/llama_cpp.rb +38 -83
- data/sig/llama_cpp.rbs +0 -59
- metadata +4 -12
- data/examples/README.md +0 -92
- data/examples/chat.rb +0 -198
- data/examples/embedding.rb +0 -42
- data/examples/prompt_jp.txt +0 -8
- data/examples/simple.rb +0 -96
- data/ext/llama_cpp/llama_cpp.cpp +0 -3764
data/ext/llama_cpp/llama_cpp.h
CHANGED
data/lib/llama_cpp/version.rb
CHANGED
@@ -1,10 +1,10 @@
 # frozen_string_literal: true

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.17.10'
+  VERSION = '0.18.0'

   # The supported version of llama.cpp.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b4611'
 end
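After this change the gem's namespace is `LlamaCpp` (the old `LLaMACpp` constant survives only as an alias added at the bottom of `lib/llama_cpp.rb`, shown in the next diff), and the bundled llama.cpp is pinned to build `b4611`. A minimal post-upgrade check, assuming nothing beyond the constants and alias visible in this diff:

```ruby
require 'llama_cpp'

puts LlamaCpp::VERSION            # => "0.18.0"
puts LlamaCpp::LLAMA_CPP_VERSION  # => "b4611"

# The old module name still resolves via the `LLaMACpp = LlamaCpp` alias
# defined at the end of lib/llama_cpp.rb.
puts LLaMACpp::VERSION            # => "0.18.0"
```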
data/lib/llama_cpp.rb
CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   module_function

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [
+  # @param context [LlamaCpp::LlamaContext] The context to use.
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
-  # @param n_keep [Integer] The number of tokens to keep in the context.
-  # @param n_batch [Integer] The number of tokens to process in a batch.
-  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
-  # @param repeat_penalty [Float] The repetition penalty.
-  # @param frequency [Float] The frequency penalty.
-  # @param presence [Float] The presence penalty.
-  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
-  # @param top_p [Float] The probability threshold for nucleus sampling.
-  # @param tfs_z [Float] The z parameter for tail-free sampling.
-  # @param typical_p [Float] The probability for typical sampling.
-  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/
-
-               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
-               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
-    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+  def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+    raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

-
-
+    model = LlamaCpp.llama_get_model(context)
+    vocab = LlamaCpp.llama_model_get_vocab(model)

-
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+    n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

-
+    prompt_tokens = []
+    raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+                                                                     true).negative?

-
-
-
-
-
+    ctx_params = LlamaCpp::LlamaContextParams.new
+    ctx_params.n_ctx = n_prompt + n_predict - 1
+    ctx_params.n_batch = n_prompt
+    ctx_params.no_perf = false
+
+    ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+    sparams = LlamaCpp::LlamaSamplerChainParams.new
+    sparams.no_perf = false
+    smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+    LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+    batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+    n_pos = 0
     output = []
+    while n_pos + batch.n_tokens < n_prompt + n_predict
+      break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+      n_pos += batch.n_tokens
+
+      new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+      break if llama_vocab_is_eog?(vocab, new_token_id)
+
+      buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+      output << buf

-
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = n_keep
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
-      end
-
-      n_past += embd.size
-      embd.clear
-
-      if embd_input.size <= n_consumed
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        # apply penalties
-        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-        context.sample_repetition_penalties(
-          candidates, last_n_tokens[-last_n_repeat..],
-          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
-        )
-
-        # temperature sampling
-        context.sample_top_k(candidates, k: top_k)
-        context.sample_tail_free(candidates, z: tfs_z)
-        context.sample_typical(candidates, prob: typical_p)
-        context.sample_top_p(candidates, prob: top_p)
-        context.sample_temp(candidates, temp: temperature)
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        embd.push(id)
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= n_batch
-        end
-      end
-
-      embd.each { |token| output << context.model.token_to_piece(token) }
-
-      break if !embd.empty? && embd[-1] == context.model.token_eos
+      batch = LlamaCpp.llama_batch_get_one([new_token_id])
     end

-    output.join
+    output.join
   end
 end

-LLaMACpp
-at_exit { LLaMACpp.backend_free }
+LLaMACpp = LlamaCpp
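The rewritten `generate` drops every sampling keyword except `n_predict` and expects a `LlamaCpp::LlamaContext` created through the new flat, llama.cpp-style API. A rough usage sketch under that assumption; `LlamaCpp::LlamaModelParams` and `LlamaCpp.llama_model_load_from_file` mirror the llama.cpp C API of build b4611 but do not appear in this diff, so treat those two names as assumptions:

```ruby
require 'llama_cpp'

# Assumed model-loading calls (mirroring llama.cpp's C API; not shown in this diff).
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

# LlamaContextParams and llama_init_from_model are the same calls generate uses internally.
ctx_params = LlamaCpp::LlamaContextParams.new
context = LlamaCpp.llama_init_from_model(model, ctx_params)

# Only n_predict remains as a keyword argument in 0.18.0.
puts LlamaCpp.generate(context, 'Hello, World.', n_predict: 128)
```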
data/sig/llama_cpp.rbs
CHANGED
@@ -97,15 +97,6 @@ module LLaMACpp
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
   LLAMA_KV_OVERRIDE_TYPE_STR: Integer

-  LLAMA_GRETYPE_END: Integer
-  LLAMA_GRETYPE_ALT: Integer
-  LLAMA_GRETYPE_RULE_REF: Integer
-  LLAMA_GRETYPE_CHAR: Integer
-  LLAMA_GRETYPE_CHAR_NOT: Integer
-  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
-  LLAMA_GRETYPE_CHAR_ALT: Integer
-  LLAMA_GRETYPE_CHAR_ANY: Integer
-
   LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
   LLAMA_ROPE_SCALING_TYPE_NONE: Integer
   LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
@@ -205,20 +196,6 @@ module LLaMACpp
     def detokenize: (Array[Integer], ?remove_special: bool, ?unparse_special: bool) -> String
   end

-  class Timings
-    public
-
-    def t_start_ms: () -> Float
-    def t_end_ms: () -> Float
-    def t_load_ms: () -> Float
-    def t_sample_ms: () -> Float
-    def t_p_eval_ms: () -> Float
-    def t_eval_ms: () -> Float
-    def n_sample: () -> Integer
-    def n_p_eval: () -> Integer
-    def n_eval: () -> Integer
-  end
-
   class ModelKVOverride
     public

@@ -295,9 +272,6 @@ module LLaMACpp
     def n_seq_max: () -> Integer
     def n_threads: () -> Integer
     def n_threads_batch: () -> Integer
-    def timings: () -> ::LLaMACpp::Timings
-    def print_timings: () -> void
-    def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
     def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
@@ -308,27 +282,10 @@ module LLaMACpp
     def kv_cache_seq_pos_max: (Integer) -> Integer
     def kv_cache_defrag: () -> void
     def kv_cache_update: () -> void
-    def set_rng_seed: (Integer) -> void
     def set_causal_attn: (bool) -> void
     def synchronize: () -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
-    def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
-    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
-    def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
-    def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
-    def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
-    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
-    def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
-    def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
-    def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
-    def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
-    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
-    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
     def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
     def pooling_type: () -> Integer
   end
@@ -336,8 +293,6 @@ module LLaMACpp
   class ContextParams
     public

-    def seed: () -> Integer
-    def seed=: (Integer) -> Integer
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
@@ -408,18 +363,4 @@ module LLaMACpp
   end

   class Params = ContextParams
-
-  class GrammarElement
-    public
-
-    def initialize: (?type: Integer, ?value: Integer) -> void
-    def type: () -> Integer
-    def type=: (Integer) -> Integer
-    def value: () -> Integer
-    def value=: (Integer) -> Integer
-  end
-
-  class Grammar
-    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
-  end
 end
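These removals track the sampling rework visible in `lib/llama_cpp.rb`: the per-context `sample_*` helpers, `Timings`, `set_rng_seed`, and the grammar classes are gone, and sampling is now configured through a sampler chain. A rough migration sketch; only `llama_sampler_chain_init`, `llama_sampler_chain_add`, `llama_sampler_init_greedy`, and `llama_sampler_sample` appear in this diff, while `llama_sampler_init_top_k`, `llama_sampler_init_temp`, and `llama_sampler_init_dist` are samplers from llama.cpp b4611 that are assumed to be bound under the same names:

```ruby
# Before (signatures removed above):
#   context.sample_top_k(candidates, k: 40)
#   context.sample_temp(candidates, temp: 0.8)
#   id = context.sample_token(candidates)

# After: build a sampler chain once, then draw tokens from it per step.
sparams = LlamaCpp::LlamaSamplerChainParams.new
smpl = LlamaCpp.llama_sampler_chain_init(sparams)
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_top_k(40))   # assumed binding
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_temp(0.8))   # assumed binding
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_dist(1234))  # assumed binding

id = LlamaCpp.llama_sampler_sample(smpl, context, -1)
```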
metadata
CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.17.10
+  version: 0.18.0
 platform: ruby
 authors:
 - yoshoku
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2025-02-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,13 +21,8 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
-- examples/README.md
-- examples/chat.rb
-- examples/embedding.rb
-- examples/prompt_jp.txt
-- examples/simple.rb
 - ext/llama_cpp/extconf.rb
-- ext/llama_cpp/llama_cpp.cpp
+- ext/llama_cpp/llama_cpp.c
 - ext/llama_cpp/llama_cpp.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
@@ -42,7 +36,6 @@ metadata:
   changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
   documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
   rubygems_mfa_required: 'true'
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -57,8 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.
 test_files: []
data/examples/README.md
DELETED
@@ -1,92 +0,0 @@
-# llama_cpp.rb/examples
-
-## chat.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./chat.rb -m /path/to/quantized-model.bin -t 4
-...
-User: Please tell me the largest city in Japan.
-Bob: Sure. The largest city in Japan is Tokyo.
-User:
-```
-
-### Options
-
-```sh
-$ ./chat.rb help main
-Usage:
-  chat.rb main -m, --model=MODEL
-
-Options:
-  -s, [--seed=N]                         # random seed
-                                         # Default: -1
-  -t, [--threads=N]                      # number of threads
-                                         # Default: 2
-  -m, --model=MODEL                      # path to model file
-  -f, [--file=FILE]                      # prompt file to start generation
-  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
-  -b, [--batch-size=N]                   # batch size for prompt processing
-                                         # Default: 1024
-  -n, [--n-predict=N]                    # number of tokens to predict
-                                         # Default: 256
-  [--keep=N]                             # number of tokens to keep from the initial prompt
-                                         # Default: 48
-  [--repeat-last-n=N]                    # last n tokens to consider for penalize
-                                         # Default: 64
-  [--repeat-penalty=N]                   # penalize repeat sequence of tokens
-                                         # Default: 1.0
-  [--presence-penalty=N]                 # repeat alpha presence penalty
-                                         # Default: 0.0
-  [--frequency-penalty=N]                # repeat alpha frequency penalty
-                                         # Default: 0.0
-  [--top-k=N]                            # top k sampling
-                                         # Default: 40
-  [--top-p=N]                            # top p sampling
-                                         # Default: 0.95
-  [--tfs-z=N]                            # tail free sampling, parameter z
-                                         # Default: 1.0
-  [--typical-p=N]                        # locally typical sampling, parameter p
-                                         # Default: 1.0
-  [--temp=N]                             # temperature
-                                         # Default: 0.8
-  [--n-gpu-layers=N]                     # number of layers on GPU
-                                         # Default: 0
-
-Start chat
-```
-
-## embedding.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
-...
-0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
-```
-
-### Options
-
-```
-$ ./embedding.rb help main
-Usage:
-  embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
-Options:
-  -s, [--seed=N]        # random seed
-                        # Default: -1
-  -t, [--threads=N]     # number of threads
-                        # Default: 2
-  -m, --model=MODEL     # path to model file
-  -p, --prompt=PROMPT   # prompt to generate embedding
-  [--n-gpu-layers=N]    # number of layers on GPU
-                        # Default: 0
-
-Extract embedding from prompt
-```
data/examples/chat.rb
DELETED
@@ -1,198 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# chat.rb is a simple chatbot that uses llama_cpp to generate text.
-# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
-require 'llama_cpp'
-require 'thor'
-require 'readline'
-require 'etc'
-
-class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
-  default_command :main
-  desc 'main', 'Start chat'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
-  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
-  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
-  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
-  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
-  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
-  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
-  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
-  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
-  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
-  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
-  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
-  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
-  option :temp, type: :numeric, desc: 'temperature', default: 0.8
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    antiprompt = options[:reverse_prompt] || 'User:'
-    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
-    embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
-    n_ctx = context.n_ctx
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-    n_keep = options[:keep]
-    n_keep = embd_input.size if n_keep > embd_input.size
-
-    last_n_tokens = [0] * n_ctx
-    interactive = true
-    is_interacting = false
-    input_echo = true
-    first_input = true
-    embd = []
-    n_consumed = 0
-    n_past = 0
-    n_remain = options[:n_predict]
-    n_vocab = context.model.n_vocab
-
-    while interactive
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = [1, n_keep].max
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        0.step(embd.size - 1, options[:batch_size]) do |i|
-          n_eval = [options[:batch_size], embd.size - i].min
-          context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
-          n_past += n_eval
-        end
-      end
-
-      embd.clear
-
-      if embd_input.size <= n_consumed && !is_interacting
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
-        context.sample_repetition_penalties(
-          candidates,
-          last_n_tokens[-last_n_repeat..],
-          penalty_repeat: options[:repeat_penalty],
-          penalty_freq: options[:frequency_penalty],
-          penalty_present: options[:presence_penalty]
-        )
-
-        context.sample_top_k(candidates, k: options[:top_k])
-        context.sample_tail_free(candidates, z: options[:tfs_z])
-        context.sample_typical(candidates, prob: options[:typical_p])
-        context.sample_top_p(candidates, prob: options[:top_p])
-        context.sample_temp(candidates, temp: options[:temp])
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        if id == context.model.token_eos
-          id = context.model.token_nl
-          unless antiprompt.empty?
-            first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
-            embd_input.concat(first_antiprompt)
-          end
-        end
-
-        embd.push(id)
-        input_echo = true
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= options[:batch_size]
-        end
-      end
-
-      if input_echo
-        output = embd.map { |token| context.model.token_to_piece(token) }
-        output_str = output.join
-        output_str.chomp!(antiprompt) if first_input
-        print(output_str)
-      end
-
-      if embd_input.size <= n_consumed
-        if antiprompt.size.positive?
-          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
-          last_output_str = last_output.join
-
-          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
-          unless last_output_str.index(antiprompt, search_start_pos).nil?
-            is_interacting = true
-            true
-          end
-        end
-
-        if n_past.positive? && is_interacting
-          if first_input
-            print("\r#{antiprompt}")
-            first_input = false
-          end
-          buffer = Readline.readline(' ')
-          break interactive = false if buffer.nil?
-
-          if buffer.size > 1
-            line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
-            embd_input.concat(line_input)
-            n_remain -= line_input.size
-          end
-
-          input_echo = false
-        end
-
-        is_interacting = false if n_past.positive?
-      end
-
-      if n_remain <= 0 && options[:n_predict] != -1
-        n_remain = options[:n_predict]
-        is_interacting = true
-      end
-    end
-  end
-
-  private
-
-  def read_prompt(filename)
-    return if filename.nil?
-
-    File.read(filename).chomp
-  end
-
-  def default_prompt(antiprompt)
-    # Reference:
-    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
-    prompt = <<~MSG
-      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-      User: Hello, Bob.
-      Bob: Hello. How may I help you today?
-      User: Please tell me the largest city in Europe.
-      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-    MSG
-    prompt + antiprompt
-  end
-end
-
-Chat.start(ARGV)
data/examples/embedding.rb
DELETED
@@ -1,42 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# embedding.rb extracts embedding from prompt.
-# It is created with reference to embedding.cpp in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Embedding < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Extract embedding from prompt'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.embedding = true
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
-    return unless embd_input.size.positive?
-
-    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
-    context.embeddings.each { |val| print("#{val} ") }
-    print("\n")
-  end
-end
-
-Embedding.start(ARGV)