llama_cpp 0.17.9 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/LICENSE.txt +1 -1
- data/README.md +8 -29
- data/ext/llama_cpp/extconf.rb +0 -3
- data/ext/llama_cpp/llama_cpp.c +5157 -0
- data/ext/llama_cpp/llama_cpp.h +0 -5
- data/lib/llama_cpp/version.rb +3 -3
- data/lib/llama_cpp.rb +38 -83
- data/sig/llama_cpp.rbs +3 -59
- metadata +4 -12
- data/examples/README.md +0 -92
- data/examples/chat.rb +0 -198
- data/examples/embedding.rb +0 -42
- data/examples/prompt_jp.txt +0 -8
- data/examples/simple.rb +0 -96
- data/ext/llama_cpp/llama_cpp.cpp +0 -3761
data/ext/llama_cpp/llama_cpp.h
CHANGED
data/lib/llama_cpp/version.rb
CHANGED
@@ -1,10 +1,10 @@
 # frozen_string_literal: true

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.17.9'
+  VERSION = '0.18.0'

   # The supported version of llama.cpp.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b4611'
 end
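The version constants now live under the renamed `LlamaCpp` module. A minimal check of the installed binding, assuming only what this diff shows (the two constants here and the `LLaMACpp = LlamaCpp` alias added in the next file):

```ruby
require 'llama_cpp'

# Constants defined in lib/llama_cpp/version.rb shown above.
puts LlamaCpp::VERSION            # => "0.18.0"
puts LlamaCpp::LLAMA_CPP_VERSION  # => "b4611"

# Code written against 0.17.x can keep using the old constant name,
# which 0.18.0 keeps as an alias (LLaMACpp = LlamaCpp).
puts LLaMACpp::VERSION            # => "0.18.0"
```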
data/lib/llama_cpp.rb
CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'

 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
-module LLaMACpp
+module LlamaCpp
   module_function

   # Generates sentences following the given prompt for operation check.
   #
-  # @param context [
+  # @param context [LlamaCpp::LlamaContext] The context to use.
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
-  # @param n_keep [Integer] The number of tokens to keep in the context.
-  # @param n_batch [Integer] The number of tokens to process in a batch.
-  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
-  # @param repeat_penalty [Float] The repetition penalty.
-  # @param frequency [Float] The frequency penalty.
-  # @param presence [Float] The presence penalty.
-  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
-  # @param top_p [Float] The probability threshold for nucleus sampling.
-  # @param tfs_z [Float] The z parameter for tail-free sampling.
-  # @param typical_p [Float] The probability for typical sampling.
-  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/
-
-               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
-               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
-    raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+  def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+    raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

-
-
+    model = LlamaCpp.llama_get_model(context)
+    vocab = LlamaCpp.llama_model_get_vocab(model)

-
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+    n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

-
+    prompt_tokens = []
+    raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+                                                                     true).negative?

-
-
-
-
-
+    ctx_params = LlamaCpp::LlamaContextParams.new
+    ctx_params.n_ctx = n_prompt + n_predict - 1
+    ctx_params.n_batch = n_prompt
+    ctx_params.no_perf = false
+
+    ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+    sparams = LlamaCpp::LlamaSamplerChainParams.new
+    sparams.no_perf = false
+    smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+    LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+    batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+    n_pos = 0
     output = []
+    while n_pos + batch.n_tokens < n_prompt + n_predict
+      break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+      n_pos += batch.n_tokens
+
+      new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+      break if llama_vocab_is_eog?(vocab, new_token_id)
+
+      buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+      output << buf

-
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = n_keep
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
-      end
-
-      n_past += embd.size
-      embd.clear
-
-      if embd_input.size <= n_consumed
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        # apply penalties
-        last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-        context.sample_repetition_penalties(
-          candidates, last_n_tokens[-last_n_repeat..],
-          penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
-        )
-
-        # temperature sampling
-        context.sample_top_k(candidates, k: top_k)
-        context.sample_tail_free(candidates, z: tfs_z)
-        context.sample_typical(candidates, prob: typical_p)
-        context.sample_top_p(candidates, prob: top_p)
-        context.sample_temp(candidates, temp: temperature)
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        embd.push(id)
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= n_batch
-        end
-      end
-
-      embd.each { |token| output << context.model.token_to_piece(token) }
-
-      break if !embd.empty? && embd[-1] == context.model.token_eos
+      batch = LlamaCpp.llama_batch_get_one([new_token_id])
     end

-    output.join
+    output.join
   end
 end

-LLaMACpp
-at_exit { LLaMACpp.backend_free }
+LLaMACpp = LlamaCpp
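The rewritten `generate` above illustrates the shape of the 0.18.0 API: thin module functions mirroring the llama.cpp C API (`llama_tokenize`, `llama_decode`, sampler chains) replace the removed `Context#sample_*` object methods, and the argument type is now `LlamaCpp::LlamaContext` rather than `LLaMACpp::Context`. A rough caller-side sketch follows; the model-loading step (`LlamaCpp::LlamaModelParams`, `llama_model_load_from_file`) and the model path are assumptions not shown in this diff, so check the 0.18.0 README for the exact loading calls:

```ruby
require 'llama_cpp'

# Assumed loading step: llama_model_load_from_file and LlamaModelParams are
# not part of this diff and may differ in the released gem.
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

# llama_init_from_model and LlamaContextParams do appear in the new generate above.
context = LlamaCpp.llama_init_from_model(model, LlamaCpp::LlamaContextParams.new)

# generate now takes only the context, the prompt, and an optional n_predict;
# the 0.17.x sampling keywords (top_k, top_p, temperature, ...) are gone.
puts LlamaCpp.generate(context, 'Hello, World.', n_predict: 64)

# The old module name still resolves through the LLaMACpp = LlamaCpp alias,
# but note that the file-level at_exit { backend_free } hook was removed.
puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 64)
```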
data/sig/llama_cpp.rbs
CHANGED
@@ -16,6 +16,7 @@ module LLaMACpp
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
   LLAMA_VOCAB_TYPE_UGM: Integer
+  LLAMA_VOCAB_TYPE_RWKV: Integer

   LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
   LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
@@ -87,6 +88,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q4_0_4_4: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0_4_8: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0_8_8: Integer
+  LLAMA_FTYPE_MOSTLY_TQ1_0: Integer
+  LLAMA_FTYPE_MOSTLY_TQ2_0: Integer
   LLAMA_FTYPE_GUESSED: Integer

   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
@@ -94,15 +97,6 @@ module LLaMACpp
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
   LLAMA_KV_OVERRIDE_TYPE_STR: Integer

-  LLAMA_GRETYPE_END: Integer
-  LLAMA_GRETYPE_ALT: Integer
-  LLAMA_GRETYPE_RULE_REF: Integer
-  LLAMA_GRETYPE_CHAR: Integer
-  LLAMA_GRETYPE_CHAR_NOT: Integer
-  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
-  LLAMA_GRETYPE_CHAR_ALT: Integer
-  LLAMA_GRETYPE_CHAR_ANY: Integer
-
   LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
   LLAMA_ROPE_SCALING_TYPE_NONE: Integer
   LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
@@ -202,20 +196,6 @@ module LLaMACpp
     def detokenize: (Array[Integer], ?remove_special: bool, ?unparse_special: bool) -> String
   end

-  class Timings
-    public
-
-    def t_start_ms: () -> Float
-    def t_end_ms: () -> Float
-    def t_load_ms: () -> Float
-    def t_sample_ms: () -> Float
-    def t_p_eval_ms: () -> Float
-    def t_eval_ms: () -> Float
-    def n_sample: () -> Integer
-    def n_p_eval: () -> Integer
-    def n_eval: () -> Integer
-  end
-
   class ModelKVOverride
     public

@@ -292,9 +272,6 @@ module LLaMACpp
     def n_seq_max: () -> Integer
     def n_threads: () -> Integer
     def n_threads_batch: () -> Integer
-    def timings: () -> ::LLaMACpp::Timings
-    def print_timings: () -> void
-    def reset_timings: () -> void
     def kv_cache_token_count: () -> Integer
     def kv_cache_clear: () -> void
     def kv_cache_seq_rm: (Integer, Integer, Integer) -> void
@@ -305,27 +282,10 @@ module LLaMACpp
     def kv_cache_seq_pos_max: (Integer) -> Integer
     def kv_cache_defrag: () -> void
     def kv_cache_update: () -> void
-    def set_rng_seed: (Integer) -> void
     def set_causal_attn: (bool) -> void
     def synchronize: () -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
-    def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
-    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
-    def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
-    def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
-    def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
-    def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
-    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
-    def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
-    def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
-    def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
-    def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
-    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
-    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
     def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
     def pooling_type: () -> Integer
   end
@@ -333,8 +293,6 @@ module LLaMACpp
   class ContextParams
     public

-    def seed: () -> Integer
-    def seed=: (Integer) -> Integer
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
     def n_batch: () -> Integer
@@ -405,18 +363,4 @@ module LLaMACpp
   end

   class Params = ContextParams
-
-  class GrammarElement
-    public
-
-    def initialize: (?type: Integer, ?value: Integer) -> void
-    def type: () -> Integer
-    def type=: (Integer) -> Integer
-    def value: () -> Integer
-    def value=: (Integer) -> Integer
-  end
-
-  class Grammar
-    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
-  end
 end
metadata
CHANGED
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.17.9
+  version: 0.18.0
 platform: ruby
 authors:
 - yoshoku
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2025-02-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -22,13 +21,8 @@ files:
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
-- examples/README.md
-- examples/chat.rb
-- examples/embedding.rb
-- examples/prompt_jp.txt
-- examples/simple.rb
 - ext/llama_cpp/extconf.rb
-- ext/llama_cpp/llama_cpp.cpp
+- ext/llama_cpp/llama_cpp.c
 - ext/llama_cpp/llama_cpp.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
@@ -42,7 +36,6 @@ metadata:
   changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
   documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
   rubygems_mfa_required: 'true'
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -57,8 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.2
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.
 test_files: []
data/examples/README.md
DELETED
@@ -1,92 +0,0 @@
-# llama_cpp.rb/examples
-
-## chat.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./chat.rb -m /path/to/quantized-model.bin -t 4
-...
-User: Please tell me the largest city in Japan.
-Bob: Sure. The largest city in Japan is Tokyo.
-User:
-```
-
-### Options
-
-```sh
-$ ./chat.rb help main
-Usage:
-  chat.rb main -m, --model=MODEL
-
-Options:
-  -s, [--seed=N]                         # random seed
-                                         # Default: -1
-  -t, [--threads=N]                      # number of threads
-                                         # Default: 2
-  -m, --model=MODEL                      # path to model file
-  -f, [--file=FILE]                      # prompt file to start generation
-  -r, [--reverse-prompt=REVERSE_PROMPT]  # halt generation at PROMPT, return control in interactive mode
-  -b, [--batch-size=N]                   # batch size for prompt processing
-                                         # Default: 1024
-  -n, [--n-predict=N]                    # number of tokens to predict
-                                         # Default: 256
-      [--keep=N]                         # number of tokens to keep from the initial prompt
-                                         # Default: 48
-      [--repeat-last-n=N]                # last n tokens to consider for penalize
-                                         # Default: 64
-      [--repeat-penalty=N]               # penalize repeat sequence of tokens
-                                         # Default: 1.0
-      [--presence-penalty=N]             # repeat alpha presence penalty
-                                         # Default: 0.0
-      [--frequency-penalty=N]            # repeat alpha frequency penalty
-                                         # Default: 0.0
-      [--top-k=N]                        # top k sampling
-                                         # Default: 40
-      [--top-p=N]                        # top p sampling
-                                         # Default: 0.95
-      [--tfs-z=N]                        # tail free sampling, parameter z
-                                         # Default: 1.0
-      [--typical-p=N]                    # locally typical sampling, parameter p
-                                         # Default: 1.0
-      [--temp=N]                         # temperature
-                                         # Default: 0.8
-      [--n-gpu-layers=N]                 # number of layers on GPU
-                                         # Default: 0
-
-Start chat
-```
-
-## embedding.rb
-
-### Usage
-
-```sh
-$ cd examples
-$ gem install llama_cpp thor
-$ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
-...
-0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
-```
-
-### Options
-
-```
-$ ./embedding.rb help main
-Usage:
-  embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
-Options:
-  -s, [--seed=N]          # random seed
-                          # Default: -1
-  -t, [--threads=N]       # number of threads
-                          # Default: 2
-  -m, --model=MODEL       # path to model file
-  -p, --prompt=PROMPT     # prompt to generate embedding
-      [--n-gpu-layers=N]  # number of layers on GPU
-                          # Default: 0
-
-Extract embedding from prompt
-```
data/examples/chat.rb
DELETED
@@ -1,198 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# chat.rb is a simple chatbot that uses llama_cpp to generate text.
-# It is created with reference to main.cpp and chat.sh in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
-require 'llama_cpp'
-require 'thor'
-require 'readline'
-require 'etc'
-
-class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
-  default_command :main
-  desc 'main', 'Start chat'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
-  option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
-  option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
-  option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
-  option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
-  option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
-  option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
-  option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
-  option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
-  option :top_k, type: :numeric, desc: 'top k sampling', default: 40
-  option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
-  option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
-  option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
-  option :temp, type: :numeric, desc: 'temperature', default: 0.8
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    antiprompt = options[:reverse_prompt] || 'User:'
-    start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
-    embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
-    n_ctx = context.n_ctx
-    raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-    n_keep = options[:keep]
-    n_keep = embd_input.size if n_keep > embd_input.size
-
-    last_n_tokens = [0] * n_ctx
-    interactive = true
-    is_interacting = false
-    input_echo = true
-    first_input = true
-    embd = []
-    n_consumed = 0
-    n_past = 0
-    n_remain = options[:n_predict]
-    n_vocab = context.model.n_vocab
-
-    while interactive
-      unless embd.empty?
-        if n_past + embd.size > n_ctx
-          n_left = n_past - n_keep
-          n_past = [1, n_keep].max
-          embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-        end
-
-        0.step(embd.size - 1, options[:batch_size]) do |i|
-          n_eval = [options[:batch_size], embd.size - i].min
-          context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
-          n_past += n_eval
-        end
-      end
-
-      embd.clear
-
-      if embd_input.size <= n_consumed && !is_interacting
-        logits = context.logits
-        base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-        candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-        last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
-        context.sample_repetition_penalties(
-          candidates,
-          last_n_tokens[-last_n_repeat..],
-          penalty_repeat: options[:repeat_penalty],
-          penalty_freq: options[:frequency_penalty],
-          penalty_present: options[:presence_penalty]
-        )
-
-        context.sample_top_k(candidates, k: options[:top_k])
-        context.sample_tail_free(candidates, z: options[:tfs_z])
-        context.sample_typical(candidates, prob: options[:typical_p])
-        context.sample_top_p(candidates, prob: options[:top_p])
-        context.sample_temp(candidates, temp: options[:temp])
-        id = context.sample_token(candidates)
-
-        last_n_tokens.shift
-        last_n_tokens.push(id)
-
-        if id == context.model.token_eos
-          id = context.model.token_nl
-          unless antiprompt.empty?
-            first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
-            embd_input.concat(first_antiprompt)
-          end
-        end
-
-        embd.push(id)
-        input_echo = true
-        n_remain -= 1
-      else
-        while embd_input.size > n_consumed
-          embd.push(embd_input[n_consumed])
-          last_n_tokens.shift
-          last_n_tokens.push(embd_input[n_consumed])
-          n_consumed += 1
-          break if embd.size >= options[:batch_size]
-        end
-      end
-
-      if input_echo
-        output = embd.map { |token| context.model.token_to_piece(token) }
-        output_str = output.join
-        output_str.chomp!(antiprompt) if first_input
-        print(output_str)
-      end
-
-      if embd_input.size <= n_consumed
-        if antiprompt.size.positive?
-          last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
-          last_output_str = last_output.join
-
-          search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
-          unless last_output_str.index(antiprompt, search_start_pos).nil?
-            is_interacting = true
-            true
-          end
-        end
-
-        if n_past.positive? && is_interacting
-          if first_input
-            print("\r#{antiprompt}")
-            first_input = false
-          end
-          buffer = Readline.readline(' ')
-          break interactive = false if buffer.nil?
-
-          if buffer.size > 1
-            line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
-            embd_input.concat(line_input)
-            n_remain -= line_input.size
-          end
-
-          input_echo = false
-        end
-
-        is_interacting = false if n_past.positive?
-      end
-
-      if n_remain <= 0 && options[:n_predict] != -1
-        n_remain = options[:n_predict]
-        is_interacting = true
-      end
-    end
-  end
-
-  private
-
-  def read_prompt(filename)
-    return if filename.nil?
-
-    File.read(filename).chomp
-  end
-
-  def default_prompt(antiprompt)
-    # Reference:
-    # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
-    prompt = <<~MSG
-      Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-      User: Hello, Bob.
-      Bob: Hello. How may I help you today?
-      User: Please tell me the largest city in Europe.
-      Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-    MSG
-    prompt + antiprompt
-  end
-end
-
-Chat.start(ARGV)
data/examples/embedding.rb
DELETED
@@ -1,42 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-# embedding.rb extracts embedding from prompt.
-# It is created with reference to embedding.cpp in llama.cpp examples:
-# - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
-require 'llama_cpp'
-require 'thor'
-require 'etc'
-
-class Embedding < Thor # rubocop:disable Style/Documentation
-  default_command :main
-  desc 'main', 'Extract embedding from prompt'
-  option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
-  option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
-  option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
-  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
-  option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
-  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    mdl_params = LLaMACpp::ModelParams.new
-    mdl_params.n_gpu_layers = options[:n_gpu_layers]
-    model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
-    ctx_params = LLaMACpp::ContextParams.new
-    ctx_params.embedding = true
-    ctx_params.seed = options[:seed] if options[:seed] != -1
-    ctx_params.n_threads = options[:n_threads]
-    ctx_params.n_threads_batch = options[:n_threads]
-    context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
-    embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
-    return unless embd_input.size.positive?
-
-    context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
-    context.embeddings.each { |val| print("#{val} ") }
-    print("\n")
-  end
-end
-
-Embedding.start(ARGV)