llama_cpp 0.17.9 → 0.18.0

This diff compares the contents of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
data/ext/llama_cpp/llama_cpp.h CHANGED
@@ -1,11 +1,6 @@
  #ifndef LLAMA_CPP_RB_H
  #define LLAMA_CPP_RB_H 1

- #include <algorithm>
- #include <sstream>
- #include <string>
- #include <vector>
-
  #include <llama.h>

  #include <ruby.h>
data/lib/llama_cpp/version.rb CHANGED
@@ -1,10 +1,10 @@
  # frozen_string_literal: true

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.17.9'
+ VERSION = '0.18.0'

  # The supported version of llama.cpp.
- LLAMA_CPP_VERSION = 'b3639'
+ LLAMA_CPP_VERSION = 'b4611'
  end
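Since this release renames the top-level module, a quick sanity check of an installed gem might look like the minimal sketch below; it relies only on the two constants shown in the diff above.

```ruby
require 'llama_cpp'

# Both constants now live under the renamed LlamaCpp module.
puts LlamaCpp::VERSION           # => "0.18.0"
puts LlamaCpp::LLAMA_CPP_VERSION # => "b4611"
```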
data/lib/llama_cpp.rb CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  module_function

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context] The context to use.
+ # @param context [LlamaCpp::LlamaContext] The context to use.
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
- # @param n_keep [Integer] The number of tokens to keep in the context.
- # @param n_batch [Integer] The number of tokens to process in a batch.
- # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
- # @param repeat_penalty [Float] The repetition penalty.
- # @param frequency [Float] The frequency penalty.
- # @param presence [Float] The presence penalty.
- # @param top_k [Integer] The number of tokens to consider for top-k sampling.
- # @param top_p [Float] The probability threshold for nucleus sampling.
- # @param tfs_z [Float] The z parameter for tail-free sampling.
- # @param typical_p [Float] The probability for typical sampling.
- # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
- n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
- repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
- top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
- raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

- spaced_prompt = " #{prompt}"
- embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)
+ model = LlamaCpp.llama_get_model(context)
+ vocab = LlamaCpp.llama_model_get_vocab(model)

- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+ n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

- last_n_tokens = [0] * n_ctx
+ prompt_tokens = []
+ raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+ true).negative?

- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = n_predict
- n_vocab = context.model.n_vocab
+ ctx_params = LlamaCpp::LlamaContextParams.new
+ ctx_params.n_ctx = n_prompt + n_predict - 1
+ ctx_params.n_batch = n_prompt
+ ctx_params.no_perf = false
+
+ ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+ sparams = LlamaCpp::LlamaSamplerChainParams.new
+ sparams.no_perf = false
+ smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+ LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+ batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+ n_pos = 0
  output = []
+ while n_pos + batch.n_tokens < n_prompt + n_predict
+ break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+ n_pos += batch.n_tokens
+
+ new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+ break if llama_vocab_is_eog?(vocab, new_token_id)
+
+ buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+ output << buf

- while n_remain != 0
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = n_keep
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
- end
-
- n_past += embd.size
- embd.clear
-
- if embd_input.size <= n_consumed
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- # apply penalties
- last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
- context.sample_repetition_penalties(
- candidates, last_n_tokens[-last_n_repeat..],
- penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
- )
-
- # temperature sampling
- context.sample_top_k(candidates, k: top_k)
- context.sample_tail_free(candidates, z: tfs_z)
- context.sample_typical(candidates, prob: typical_p)
- context.sample_top_p(candidates, prob: top_p)
- context.sample_temp(candidates, temp: temperature)
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- embd.push(id)
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= n_batch
- end
- end
-
- embd.each { |token| output << context.model.token_to_piece(token) }
-
- break if !embd.empty? && embd[-1] == context.model.token_eos
+ batch = LlamaCpp.llama_batch_get_one([new_token_id])
  end

- output.join.scrub('?').strip.delete_prefix(prompt).strip
+ output.join
  end
  end

- LLaMACpp.backend_init
- at_exit { LLaMACpp.backend_free }
+ LLaMACpp = LlamaCpp
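For reference, calling the rewritten `generate` under 0.18.0 could look roughly like the sketch below. The module rename, `LlamaCpp.generate`, `LlamaCpp::LlamaContextParams`, and `LlamaCpp.llama_init_from_model` come from the diff above; the model-loading and cleanup helpers (`LlamaModelParams`, `llama_model_load_from_file`, `llama_free`, `llama_model_free`) are assumptions based on the llama.cpp C API that the binding mirrors, not something this diff confirms.

```ruby
require 'llama_cpp'

# NOTE: the loading/cleanup helpers below are assumed to mirror the llama.cpp
# C API; only generate, LlamaContextParams and llama_init_from_model are
# confirmed by the diff above.
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

ctx_params = LlamaCpp::LlamaContextParams.new
context = LlamaCpp.llama_init_from_model(model, ctx_params)

# generate takes a context, a prompt, and an optional token budget.
puts LlamaCpp.generate(context, 'Hello, World.', n_predict: 32)

LlamaCpp.llama_free(context)
LlamaCpp.llama_model_free(model)
```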
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,7 @@ module LLaMACpp
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer
  LLAMA_VOCAB_TYPE_UGM: Integer
+ LLAMA_VOCAB_TYPE_RWKV: Integer

  LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
  LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
@@ -87,6 +88,8 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_0_4_4: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0_4_8: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0_8_8: Integer
+ LLAMA_FTYPE_MOSTLY_TQ1_0: Integer
+ LLAMA_FTYPE_MOSTLY_TQ2_0: Integer
  LLAMA_FTYPE_GUESSED: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
@@ -94,15 +97,6 @@ module LLaMACpp
  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
  LLAMA_KV_OVERRIDE_TYPE_STR: Integer

- LLAMA_GRETYPE_END: Integer
- LLAMA_GRETYPE_ALT: Integer
- LLAMA_GRETYPE_RULE_REF: Integer
- LLAMA_GRETYPE_CHAR: Integer
- LLAMA_GRETYPE_CHAR_NOT: Integer
- LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
- LLAMA_GRETYPE_CHAR_ALT: Integer
- LLAMA_GRETYPE_CHAR_ANY: Integer
-
  LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED: Integer
  LLAMA_ROPE_SCALING_TYPE_NONE: Integer
  LLAMA_ROPE_SCALING_TYPE_LINEAR: Integer
@@ -202,20 +196,6 @@ module LLaMACpp
  def detokenize: (Array[Integer], ?remove_special: bool, ?unparse_special: bool) -> String
  end

- class Timings
- public
-
- def t_start_ms: () -> Float
- def t_end_ms: () -> Float
- def t_load_ms: () -> Float
- def t_sample_ms: () -> Float
- def t_p_eval_ms: () -> Float
- def t_eval_ms: () -> Float
- def n_sample: () -> Integer
- def n_p_eval: () -> Integer
- def n_eval: () -> Integer
- end
-
  class ModelKVOverride
  public

@@ -292,9 +272,6 @@ module LLaMACpp
  def n_seq_max: () -> Integer
  def n_threads: () -> Integer
  def n_threads_batch: () -> Integer
- def timings: () -> ::LLaMACpp::Timings
- def print_timings: () -> void
- def reset_timings: () -> void
  def kv_cache_token_count: () -> Integer
  def kv_cache_clear: () -> void
  def kv_cache_seq_rm: (Integer, Integer,Integer) -> void
@@ -305,27 +282,10 @@ module LLaMACpp
  def kv_cache_seq_pos_max: (Integer) -> Integer
  def kv_cache_defrag: () -> void
  def kv_cache_update: () -> void
- def set_rng_seed: (Integer) -> void
  def set_causal_attn: (bool) -> void
  def synchronize: () -> void
  def load_session_file: (session_path: String) -> void
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
- def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
- def sample_apply_guidance: (logits: Array[Float], logits_guidance: Array[Float], scale: Float) -> void
- def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
- def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
- def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_min_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
- def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
- def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
- def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
- def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
- def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
- def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
- def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
- def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
- def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
  def pooling_type: () -> Integer
  end
@@ -333,8 +293,6 @@ module LLaMACpp
  class ContextParams
  public

- def seed: () -> Integer
- def seed=: (Integer) -> Integer
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
  def n_batch: () -> Integer
@@ -405,18 +363,4 @@ module LLaMACpp
  end

  class Params = ContextParams
-
- class GrammarElement
- public
-
- def initialize: (?type: Integer, ?value: Integer) -> void
- def type: () -> Integer
- def type=: (Integer) -> Integer
- def value: () -> Integer
- def value=: (Integer) -> Integer
- end
-
- class Grammar
- def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
- end
  end
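The signatures removed here (the `Context#sample_*` family, `Timings`, and the grammar classes) belong to the old high-level sampling interface; in 0.18.0 sampling goes through a sampler chain, as seen in the `generate` diff above. A minimal sketch of building such a chain follows: `LlamaSamplerChainParams`, `llama_sampler_chain_init`, and `llama_sampler_chain_add` appear in the diff, while `llama_sampler_init_top_k`, `llama_sampler_init_temp`, and `llama_sampler_init_dist` are assumed to be exposed the way the llama.cpp C API defines them.

```ruby
# Build a sampler chain: top-k -> temperature -> seeded random sampling.
# The chain init/add calls are confirmed by the generate diff; the top_k/temp/
# dist initializers are assumptions mirroring the llama.cpp C API.
sparams = LlamaCpp::LlamaSamplerChainParams.new
smpl = LlamaCpp.llama_sampler_chain_init(sparams)
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_top_k(40))
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_temp(0.8))
LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_dist(1234))

# After each llama_decode step, pick the next token from the last logits:
# new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
```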
metadata CHANGED
@@ -1,14 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.17.9
+ version: 0.18.0
  platform: ruby
  authors:
  - yoshoku
- autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-09-07 00:00:00.000000000 Z
+ date: 2025-02-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -22,13 +21,8 @@ files:
  - CODE_OF_CONDUCT.md
  - LICENSE.txt
  - README.md
- - examples/README.md
- - examples/chat.rb
- - examples/embedding.rb
- - examples/prompt_jp.txt
- - examples/simple.rb
  - ext/llama_cpp/extconf.rb
- - ext/llama_cpp/llama_cpp.cpp
+ - ext/llama_cpp/llama_cpp.c
  - ext/llama_cpp/llama_cpp.h
  - lib/llama_cpp.rb
  - lib/llama_cpp/version.rb
@@ -42,7 +36,6 @@ metadata:
  changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
  documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
  rubygems_mfa_required: 'true'
- post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -57,8 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.9
- signing_key:
+ rubygems_version: 3.6.2
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.
  test_files: []
data/examples/README.md DELETED
@@ -1,92 +0,0 @@
- # llama_cpp.rb/examples
-
- ## chat.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./chat.rb -m /path/to/quantized-model.bin -t 4
- ...
- User: Please tell me the largest city in Japan.
- Bob: Sure. The largest city in Japan is Tokyo.
- User:
- ```
-
- ### Options
-
- ```sh
- $ ./chat.rb help main
- Usage:
- chat.rb main -m, --model=MODEL
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -f, [--file=FILE] # prompt file to start generation
- -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
- -b, [--batch-size=N] # batch size for prompt processing
- # Default: 1024
- -n, [--n-predict=N] # number of tokens to predict
- # Default: 256
- [--keep=N] # number of tokens to keep from the initial prompt
- # Default: 48
- [--repeat-last-n=N] # last n tokens to consider for penalize
- # Default: 64
- [--repeat-penalty=N] # penalize repeat sequence of tokens
- # Default: 1.0
- [--presence-penalty=N] # repeat alpha presence penalty
- # Default: 0.0
- [--frequency-penalty=N] # repeat alpha frequency penalty
- # Default: 0.0
- [--top-k=N] # top k sampling
- # Default: 40
- [--top-p=N] # top p sampling
- # Default: 0.95
- [--tfs-z=N] # tail free sampling, parameter z
- # Default: 1.0
- [--typical-p=N] # locally typical sampling, parameter p
- # Default: 1.0
- [--temp=N] # temperature
- # Default: 0.8
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Start chat
- ```
-
- ## embedding.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
- ...
- 0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
- ```
-
- ### Options
-
- ```
- $ ./embedding.rb help main
- Usage:
- embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -p, --prompt=PROMPT # prompt to generate embedding
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Extract embedding from prompt
- ```
data/examples/chat.rb DELETED
@@ -1,198 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # chat.rb is a simple chatbot that uses llama_cpp to generate text.
- # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
- require 'llama_cpp'
- require 'thor'
- require 'readline'
- require 'etc'
-
- class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
- default_command :main
- desc 'main', 'Start chat'
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
- option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
- option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
- option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
- option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
- option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
- option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
- option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
- option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
- option :top_k, type: :numeric, desc: 'top k sampling', default: 40
- option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
- option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
- option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
- option :temp, type: :numeric, desc: 'temperature', default: 0.8
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
- mdl_params = LLaMACpp::ModelParams.new
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
- ctx_params = LLaMACpp::ContextParams.new
- ctx_params.seed = options[:seed] if options[:seed] != -1
- ctx_params.n_threads = options[:n_threads]
- ctx_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
- antiprompt = options[:reverse_prompt] || 'User:'
- start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
- embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
- n_keep = options[:keep]
- n_keep = embd_input.size if n_keep > embd_input.size
-
- last_n_tokens = [0] * n_ctx
- interactive = true
- is_interacting = false
- input_echo = true
- first_input = true
- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = options[:n_predict]
- n_vocab = context.model.n_vocab
-
- while interactive
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = [1, n_keep].max
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- 0.step(embd.size - 1, options[:batch_size]) do |i|
- n_eval = [options[:batch_size], embd.size - i].min
- context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
- n_past += n_eval
- end
- end
-
- embd.clear
-
- if embd_input.size <= n_consumed && !is_interacting
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
- context.sample_repetition_penalties(
- candidates,
- last_n_tokens[-last_n_repeat..],
- penalty_repeat: options[:repeat_penalty],
- penalty_freq: options[:frequency_penalty],
- penalty_present: options[:presence_penalty]
- )
-
- context.sample_top_k(candidates, k: options[:top_k])
- context.sample_tail_free(candidates, z: options[:tfs_z])
- context.sample_typical(candidates, prob: options[:typical_p])
- context.sample_top_p(candidates, prob: options[:top_p])
- context.sample_temp(candidates, temp: options[:temp])
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- if id == context.model.token_eos
- id = context.model.token_nl
- unless antiprompt.empty?
- first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
- embd_input.concat(first_antiprompt)
- end
- end
-
- embd.push(id)
- input_echo = true
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= options[:batch_size]
- end
- end
-
- if input_echo
- output = embd.map { |token| context.model.token_to_piece(token) }
- output_str = output.join
- output_str.chomp!(antiprompt) if first_input
- print(output_str)
- end
-
- if embd_input.size <= n_consumed
- if antiprompt.size.positive?
- last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
- last_output_str = last_output.join
-
- search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
- unless last_output_str.index(antiprompt, search_start_pos).nil?
- is_interacting = true
- true
- end
- end
-
- if n_past.positive? && is_interacting
- if first_input
- print("\r#{antiprompt}")
- first_input = false
- end
- buffer = Readline.readline(' ')
- break interactive = false if buffer.nil?
-
- if buffer.size > 1
- line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
- embd_input.concat(line_input)
- n_remain -= line_input.size
- end
-
- input_echo = false
- end
-
- is_interacting = false if n_past.positive?
- end
-
- if n_remain <= 0 && options[:n_predict] != -1
- n_remain = options[:n_predict]
- is_interacting = true
- end
- end
- end
-
- private
-
- def read_prompt(filename)
- return if filename.nil?
-
- File.read(filename).chomp
- end
-
- def default_prompt(antiprompt)
- # Reference:
- # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
- prompt = <<~MSG
- Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
- User: Hello, Bob.
- Bob: Hello. How may I help you today?
- User: Please tell me the largest city in Europe.
- Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
- MSG
- prompt + antiprompt
- end
- end
-
- Chat.start(ARGV)
data/examples/embedding.rb DELETED
@@ -1,42 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # embedding.rb extracts embedding from prompt.
- # It is created with reference to embedding.cpp in llama.cpp examples:
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
-
- require 'llama_cpp'
- require 'thor'
- require 'etc'
-
- class Embedding < Thor # rubocop:disable Style/Documentation
- default_command :main
- desc 'main', 'Extract embedding from prompt'
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
- mdl_params = LLaMACpp::ModelParams.new
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
- ctx_params = LLaMACpp::ContextParams.new
- ctx_params.embedding = true
- ctx_params.seed = options[:seed] if options[:seed] != -1
- ctx_params.n_threads = options[:n_threads]
- ctx_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
- embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
-
- return unless embd_input.size.positive?
-
- context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
-
- context.embeddings.each { |val| print("#{val} ") }
- print("\n")
- end
- end
-
- Embedding.start(ARGV)
data/examples/prompt_jp.txt DELETED
@@ -1,8 +0,0 @@
- UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
- Taroは親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
-
- User: こんにちには、Taro。
- Taro: こんにちは、今日はどのような要件ですか?
- User: 日本で最大の都市について教えてください。
- Taro: はい、日本で最大の都市は東京です。日本の首都でもあります。
- User: