llama_cpp 0.17.10 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/llama_cpp.h CHANGED
@@ -1,11 +1,6 @@
  #ifndef LLAMA_CPP_RB_H
  #define LLAMA_CPP_RB_H 1

- #include <algorithm>
- #include <sstream>
- #include <string>
- #include <vector>
-
  #include <llama.h>

  #include <ruby.h>
data/lib/llama_cpp/version.rb CHANGED
@@ -1,10 +1,10 @@
  # frozen_string_literal: true

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.17.10'
+ VERSION = '0.18.0'

  # The supported version of llama.cpp.
- LLAMA_CPP_VERSION = 'b3676'
+ LLAMA_CPP_VERSION = 'b4611'
  end
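
The module rename (LLaMACpp to LlamaCpp) and the bumped version constants above can be checked at runtime. A minimal sketch, assuming the 0.18.0 gem is installed; the constant names are taken directly from the diff:

```ruby
require 'llama_cpp'

# Constant names come straight from lib/llama_cpp/version.rb shown above.
puts LlamaCpp::VERSION           # => "0.18.0"
puts LlamaCpp::LLAMA_CPP_VERSION # => "b4611" (the supported llama.cpp build)
```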
data/lib/llama_cpp.rb CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  module_function

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context] The context to use.
+ # @param context [LlamaCpp::LlamaContext] The context to use.
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
- # @param n_keep [Integer] The number of tokens to keep in the context.
- # @param n_batch [Integer] The number of tokens to process in a batch.
- # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
- # @param repeat_penalty [Float] The repetition penalty.
- # @param frequency [Float] The frequency penalty.
- # @param presence [Float] The presence penalty.
- # @param top_k [Integer] The number of tokens to consider for top-k sampling.
- # @param top_p [Float] The probability threshold for nucleus sampling.
- # @param tfs_z [Float] The z parameter for tail-free sampling.
- # @param typical_p [Float] The probability for typical sampling.
- # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
- n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
- repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
- top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
- raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

- spaced_prompt = " #{prompt}"
- embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)
+ model = LlamaCpp.llama_get_model(context)
+ vocab = LlamaCpp.llama_model_get_vocab(model)

- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+ n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

- last_n_tokens = [0] * n_ctx
+ prompt_tokens = []
+ raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+ true).negative?

- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = n_predict
- n_vocab = context.model.n_vocab
+ ctx_params = LlamaCpp::LlamaContextParams.new
+ ctx_params.n_ctx = n_prompt + n_predict - 1
+ ctx_params.n_batch = n_prompt
+ ctx_params.no_perf = false
+
+ ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+ sparams = LlamaCpp::LlamaSamplerChainParams.new
+ sparams.no_perf = false
+ smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+ LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+ batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+ n_pos = 0
  output = []
+ while n_pos + batch.n_tokens < n_prompt + n_predict
+ break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+ n_pos += batch.n_tokens
+
+ new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+ break if llama_vocab_is_eog?(vocab, new_token_id)
+
+ buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+ output << buf

- while n_remain != 0
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = n_keep
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
- end
-
- n_past += embd.size
- embd.clear
-
- if embd_input.size <= n_consumed
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- # apply penalties
- last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
- context.sample_repetition_penalties(
- candidates, last_n_tokens[-last_n_repeat..],
- penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
- )
-
- # temperature sampling
- context.sample_top_k(candidates, k: top_k)
- context.sample_tail_free(candidates, z: tfs_z)
- context.sample_typical(candidates, prob: typical_p)
- context.sample_top_p(candidates, prob: top_p)
- context.sample_temp(candidates, temp: temperature)
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- embd.push(id)
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= n_batch
- end
- end
-
- embd.each { |token| output << context.model.token_to_piece(token) }
-
- break if !embd.empty? && embd[-1] == context.model.token_eos
+ batch = LlamaCpp.llama_batch_get_one([new_token_id])
  end

- output.join.scrub('?').strip.delete_prefix(prompt).strip
+ output.join
  end
  end

- LLaMACpp.backend_init
- at_exit { LLaMACpp.backend_free }
+ LLaMACpp = LlamaCpp
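
For orientation, here is a minimal sketch of how the rewritten generate helper might be driven under 0.18.0. LlamaCpp.generate, LlamaCpp::LlamaContextParams, and LlamaCpp.llama_init_from_model appear in the diff above; the model-loading names (LlamaCpp::LlamaModelParams, LlamaCpp.llama_model_load_from_file) are assumptions that mirror the llama.cpp C API at b4611 and may differ in the actual binding.

```ruby
require 'llama_cpp'

# Assumed wrappers mirroring llama_model_default_params /
# llama_model_load_from_file from the llama.cpp C API; not shown in this diff.
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

# Context setup with names that do appear in the diff above.
ctx_params = LlamaCpp::LlamaContextParams.new
context = LlamaCpp.llama_init_from_model(model, ctx_params)

# The slimmed-down helper: only context, prompt, and n_predict remain.
puts LlamaCpp.generate(context, 'Hello, my name is', n_predict: 32)

# Code written against the old constant still resolves via the alias above.
puts LLaMACpp::VERSION
```

Note that generate now derives the model from the context it receives via llama_get_model and builds its own context and greedy sampler chain internally, which is why the 0.17.x sampling keyword arguments are gone.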
data/sig/llama_cpp.rbs CHANGED
@@ -97,15 +97,6 @@ module LLaMACpp
  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
  LLAMA_KV_OVERRIDE_TYPE_STR: Integer

- LLAMA_GRETYPE_END: Integer
- LLAMA_GRETYPE_ALT: Integer
- LLAMA_GRETYPE_RULE_REF: Integer
- LLAMA_GRETYPE_CHAR: Integer
- LLAMA_GRETYPE_CHAR_NOT: Integer
- LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
- LLAMA_GRETYPE_CHAR_ALT: Integer
- LLAMA_GRETYPE_CHAR_ANY: Integer
-
  LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
  LLAMA_ROPE_SCALING_TYPE_NONE: Integer
  LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
@@ -205,20 +196,6 @@ module LLaMACpp
  def detokenize: (Array[Integer], ?remove_special: bool, ?unparse_special: bool) -> String
  end

- class Timings
- public
-
- def t_start_ms: () -> Float
- def t_end_ms: () -> Float
- def t_load_ms: () -> Float
- def t_sample_ms: () -> Float
- def t_p_eval_ms: () -> Float
- def t_eval_ms: () -> Float
- def n_sample: () -> Integer
- def n_p_eval: () -> Integer
- def n_eval: () -> Integer
- end
-
  class ModelKVOverride
  public

@@ -295,9 +272,6 @@ module LLaMACpp
  def n_seq_max: () -> Integer
  def n_threads: () -> Integer
  def n_threads_batch: () -> Integer
- def timings: () -> ::LLaMACpp::Timings
- def print_timings: () -> void
- def reset_timings: () -> void
  def kv_cache_token_count: () -> Integer
  def kv_cache_clear: () -> void
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
@@ -308,27 +282,10 @@ module LLaMACpp
  def kv_cache_seq_pos_max: (Integer) -> Integer
  def kv_cache_defrag: () -> void
  def kv_cache_update: () -> void
- def set_rng_seed: (Integer) -> void
  def set_causal_attn: (bool) -> void
  def synchronize: () -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
- def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
- def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
- def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
- def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
- def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
- def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
- def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
- def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
- def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
- def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
- def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
  def pooling_type: () -> Integer
  end
@@ -336,8 +293,6 @@ module LLaMACpp
  class ContextParams
  public

- def seed: () -> Integer
- def seed=: (Integer) -> Integer
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
  def n_batch: () -> Integer
@@ -408,18 +363,4 @@ module LLaMACpp
  end

  class Params = ContextParams
-
- class GrammarElement
- public
-
- def initialize: (?type: Integer, ?value: Integer) -> void
- def type: () -> Integer
- def type=: (Integer) -> Integer
- def value: () -> Integer
- def value=: (Integer) -> Integer
- end
-
- class Grammar
- def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
- end
  end
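
The signature removals above (the per-context sample_* methods, Timings, Grammar, GrammarElement, and the seed accessors) reflect the move to llama.cpp's sampler-chain API already visible in the generate rewrite. A minimal sketch of that pattern, restricted to calls that appear in this diff; samplers other than greedy presumably have their own llama_sampler_init_* wrappers, but those are not shown here.

```ruby
# ctx is an existing LlamaCpp::LlamaContext that has decoded at least one batch.
sparams = LlamaCpp::LlamaSamplerChainParams.new
smpl = LlamaCpp.llama_sampler_chain_init(sparams)
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)

# Sample the next token from the last set of logits (-1) instead of the old
# context.sample_top_k / context.sample_token pipeline.
new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
```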
metadata CHANGED
@@ -1,14 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.17.10
+ version: 0.18.0
  platform: ruby
  authors:
  - yoshoku
- autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-09-07 00:00:00.000000000 Z
+ date: 2025-02-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -22,13 +21,8 @@ files:
  - CODE_OF_CONDUCT.md
  - LICENSE.txt
  - README.md
- - examples/README.md
- - examples/chat.rb
- - examples/embedding.rb
- - examples/prompt_jp.txt
- - examples/simple.rb
  - ext/llama_cpp/extconf.rb
- - ext/llama_cpp/llama_cpp.cpp
+ - ext/llama_cpp/llama_cpp.c
  - ext/llama_cpp/llama_cpp.h
  - lib/llama_cpp.rb
  - lib/llama_cpp/version.rb
@@ -42,7 +36,6 @@ metadata:
  changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
  documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
  rubygems_mfa_required: 'true'
- post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -57,8 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.9
- signing_key:
+ rubygems_version: 3.6.2
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.
  test_files: []
data/examples/README.md DELETED
@@ -1,92 +0,0 @@
- # llama_cpp.rb/examples
-
- ## chat.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./chat.rb -m /path/to/quantized-model.bin -t 4
- ...
- User: Please tell me the largest city in Japan.
- Bob: Sure. The largest city in Japan is Tokyo.
- User:
- ```
-
- ### Options
-
- ```sh
- $ ./chat.rb help main
- Usage:
- chat.rb main -m, --model=MODEL
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -f, [--file=FILE] # prompt file to start generation
- -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
- -b, [--batch-size=N] # batch size for prompt processing
- # Default: 1024
- -n, [--n-predict=N] # number of tokens to predict
- # Default: 256
- [--keep=N] # number of tokens to keep from the initial prompt
- # Default: 48
- [--repeat-last-n=N] # last n tokens to consider for penalize
- # Default: 64
- [--repeat-penalty=N] # penalize repeat sequence of tokens
- # Default: 1.0
- [--presence-penalty=N] # repeat alpha presence penalty
- # Default: 0.0
- [--frequency-penalty=N] # repeat alpha frequency penalty
- # Default: 0.0
- [--top-k=N] # top k sampling
- # Default: 40
- [--top-p=N] # top p sampling
- # Default: 0.95
- [--tfs-z=N] # tail free sampling, parameter z
- # Default: 1.0
- [--typical-p=N] # locally typical sampling, parameter p
- # Default: 1.0
- [--temp=N] # temperature
- # Default: 0.8
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Start chat
- ```
-
- ## embedding.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
- ...
- 0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
- ```
-
- ### Options
-
- ```
- $ ./embedding.rb help main
- Usage:
- embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -p, --prompt=PROMPT # prompt to generate embedding
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Extract embedding from prompt
- ```
data/examples/chat.rb DELETED
@@ -1,198 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # chat.rb is a simple chatbot that uses llama_cpp to generate text.
- # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
- require 'llama_cpp'
- require 'thor'
- require 'readline'
- require 'etc'
-
- class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
- default_command :main
- desc 'main', 'Start chat'
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
- option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
- option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
- option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
- option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
- option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
- option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
- option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
- option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
- option :top_k, type: :numeric, desc: 'top k sampling', default: 40
- option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
- option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
- option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
- option :temp, type: :numeric, desc: 'temperature', default: 0.8
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
- mdl_params = LLaMACpp::ModelParams.new
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
- ctx_params = LLaMACpp::ContextParams.new
- ctx_params.seed = options[:seed] if options[:seed] != -1
- ctx_params.n_threads = options[:n_threads]
- ctx_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
- antiprompt = options[:reverse_prompt] || 'User:'
- start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
- embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
- n_keep = options[:keep]
- n_keep = embd_input.size if n_keep > embd_input.size
-
- last_n_tokens = [0] * n_ctx
- interactive = true
- is_interacting = false
- input_echo = true
- first_input = true
- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = options[:n_predict]
- n_vocab = context.model.n_vocab
-
- while interactive
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = [1, n_keep].max
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- 0.step(embd.size - 1, options[:batch_size]) do |i|
- n_eval = [options[:batch_size], embd.size - i].min
- context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
- n_past += n_eval
- end
- end
-
- embd.clear
-
- if embd_input.size <= n_consumed && !is_interacting
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
- context.sample_repetition_penalties(
- candidates,
- last_n_tokens[-last_n_repeat..],
- penalty_repeat: options[:repeat_penalty],
- penalty_freq: options[:frequency_penalty],
- penalty_present: options[:presence_penalty]
- )
-
- context.sample_top_k(candidates, k: options[:top_k])
- context.sample_tail_free(candidates, z: options[:tfs_z])
- context.sample_typical(candidates, prob: options[:typical_p])
- context.sample_top_p(candidates, prob: options[:top_p])
- context.sample_temp(candidates, temp: options[:temp])
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- if id == context.model.token_eos
- id = context.model.token_nl
- unless antiprompt.empty?
- first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
- embd_input.concat(first_antiprompt)
- end
- end
-
- embd.push(id)
- input_echo = true
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= options[:batch_size]
- end
- end
-
- if input_echo
- output = embd.map { |token| context.model.token_to_piece(token) }
- output_str = output.join
- output_str.chomp!(antiprompt) if first_input
- print(output_str)
- end
-
- if embd_input.size <= n_consumed
- if antiprompt.size.positive?
- last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
- last_output_str = last_output.join
-
- search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
- unless last_output_str.index(antiprompt, search_start_pos).nil?
- is_interacting = true
- true
- end
- end
-
- if n_past.positive? && is_interacting
- if first_input
- print("\r#{antiprompt}")
- first_input = false
- end
- buffer = Readline.readline(' ')
- break interactive = false if buffer.nil?
-
- if buffer.size > 1
- line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
- embd_input.concat(line_input)
- n_remain -= line_input.size
- end
-
- input_echo = false
- end
-
- is_interacting = false if n_past.positive?
- end
-
- if n_remain <= 0 && options[:n_predict] != -1
- n_remain = options[:n_predict]
- is_interacting = true
- end
- end
- end
-
- private
-
- def read_prompt(filename)
- return if filename.nil?
-
- File.read(filename).chomp
- end
-
- def default_prompt(antiprompt)
- # Reference:
- # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
- prompt = <<~MSG
- Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
- User: Hello, Bob.
- Bob: Hello. How may I help you today?
- User: Please tell me the largest city in Europe.
- Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
- MSG
- prompt + antiprompt
- end
- end
-
- Chat.start(ARGV)
data/examples/embedding.rb DELETED
@@ -1,42 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # embedding.rb extracts embedding from prompt.
- # It is created with reference to embedding.cpp in llama.cpp examples:
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
- require 'llama_cpp'
- require 'thor'
- require 'etc'
-
- class Embedding < Thor # rubocop:disable Style/Documentation
- default_command :main
- desc 'main', 'Extract embedding from prompt'
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
- mdl_params = LLaMACpp::ModelParams.new
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
- ctx_params = LLaMACpp::ContextParams.new
- ctx_params.embedding = true
- ctx_params.seed = options[:seed] if options[:seed] != -1
- ctx_params.n_threads = options[:n_threads]
- ctx_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
- embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
- return unless embd_input.size.positive?
-
- context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
- context.embeddings.each { |val| print("#{val} ") }
- print("\n")
- end
- end
-
- Embedding.start(ARGV)
data/examples/prompt_jp.txt DELETED
@@ -1,8 +0,0 @@
- UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
- Taroは親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
-
- User: こんにちには、Taro。
- Taro: こんにちは、今日はどのような要件ですか?
- User: 日本で最大の都市について教えてください。
- Taro: はい、日本で最大の都市は東京です。日本の首都でもあります。
- User: