llama_cpp 0.17.10 → 0.18.1

data/ext/llama_cpp/llama_cpp.h CHANGED
@@ -1,11 +1,6 @@
  #ifndef LLAMA_CPP_RB_H
  #define LLAMA_CPP_RB_H 1

- #include <algorithm>
- #include <sstream>
- #include <string>
- #include <vector>
-
  #include <llama.h>

  #include <ruby.h>
data/lib/llama_cpp/version.rb CHANGED
@@ -1,10 +1,10 @@
  # frozen_string_literal: true

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.17.10'
+ VERSION = '0.18.1'

  # The supported version of llama.cpp.
- LLAMA_CPP_VERSION = 'b3676'
+ LLAMA_CPP_VERSION = 'b4713'
  end
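After upgrading, the renamed module and the bumped constants above can be checked from IRB or a one-off script. This is a minimal sketch, not part of the diff, and assumes the 0.18.1 gem is installed:

```ruby
# Minimal sketch: print the gem version and the supported llama.cpp build.
require 'llama_cpp'

puts LlamaCpp::VERSION            # expected: "0.18.1"
puts LlamaCpp::LLAMA_CPP_VERSION  # expected: "b4713"
```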
data/lib/llama_cpp.rb CHANGED
@@ -4,105 +4,60 @@ require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
- module LLaMACpp
+ module LlamaCpp
  module_function

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context] The context to use.
+ # @param context [LlamaCpp::LlamaContext] The context to use.
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
- # @param n_keep [Integer] The number of tokens to keep in the context.
- # @param n_batch [Integer] The number of tokens to process in a batch.
- # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
- # @param repeat_penalty [Float] The repetition penalty.
- # @param frequency [Float] The frequency penalty.
- # @param presence [Float] The presence penalty.
- # @param top_k [Integer] The number of tokens to consider for top-k sampling.
- # @param top_p [Float] The probability threshold for nucleus sampling.
- # @param tfs_z [Float] The z parameter for tail-free sampling.
- # @param typical_p [Float] The probability for typical sampling.
- # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
- n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
- repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
- top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
- raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ def generate(context, prompt, n_predict: 128) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+ raise ArgumentError, 'context must be a LlamaContext' unless context.is_a?(LlamaCpp::LlamaContext)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

- spaced_prompt = " #{prompt}"
- embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)
+ model = LlamaCpp.llama_get_model(context)
+ vocab = LlamaCpp.llama_model_get_vocab(model)

- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+ n_prompt = -LlamaCpp.llama_tokenize(vocab, prompt, [], 0, true, true)

- last_n_tokens = [0] * n_ctx
+ prompt_tokens = []
+ raise 'Failed to tokenize the prompt' if LlamaCpp.llama_tokenize(vocab, prompt, prompt_tokens, n_prompt, true,
+ true).negative?

- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = n_predict
- n_vocab = context.model.n_vocab
+ ctx_params = LlamaCpp::LlamaContextParams.new
+ ctx_params.n_ctx = n_prompt + n_predict - 1
+ ctx_params.n_batch = n_prompt
+ ctx_params.no_perf = false
+
+ ctx = LlamaCpp.llama_init_from_model(model, ctx_params)
+
+ sparams = LlamaCpp::LlamaSamplerChainParams.new
+ sparams.no_perf = false
+ smpl = LlamaCpp.llama_sampler_chain_init(sparams)
+ LlamaCpp.llama_sampler_chain_add(smpl, LlamaCpp.llama_sampler_init_greedy)
+
+ batch = LlamaCpp.llama_batch_get_one(prompt_tokens)
+
+ n_pos = 0
  output = []
+ while n_pos + batch.n_tokens < n_prompt + n_predict
+ break if LlamaCpp.llama_decode(ctx, batch) != 0
+
+ n_pos += batch.n_tokens
+
+ new_token_id = LlamaCpp.llama_sampler_sample(smpl, ctx, -1)
+ break if llama_vocab_is_eog?(vocab, new_token_id)
+
+ buf = llama_token_to_piece(vocab, new_token_id, 0, true)
+ output << buf

- while n_remain != 0
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = n_keep
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
- end
-
- n_past += embd.size
- embd.clear
-
- if embd_input.size <= n_consumed
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- # apply penalties
- last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
- context.sample_repetition_penalties(
- candidates, last_n_tokens[-last_n_repeat..],
- penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
- )
-
- # temperature sampling
- context.sample_top_k(candidates, k: top_k)
- context.sample_tail_free(candidates, z: tfs_z)
- context.sample_typical(candidates, prob: typical_p)
- context.sample_top_p(candidates, prob: top_p)
- context.sample_temp(candidates, temp: temperature)
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- embd.push(id)
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= n_batch
- end
- end
-
- embd.each { |token| output << context.model.token_to_piece(token) }
-
- break if !embd.empty? && embd[-1] == context.model.token_eos
+ batch = LlamaCpp.llama_batch_get_one([new_token_id])
  end

- output.join.scrub('?').strip.delete_prefix(prompt).strip
+ output.join
  end
  end

- LLaMACpp.backend_init
- at_exit { LLaMACpp.backend_free }
+ LLaMACpp = LlamaCpp
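With the hunk above, generate no longer takes sampling keyword arguments; it derives the model from the passed context and builds its own context and greedy sampler internally, so callers only supply a context, a prompt, and optionally n_predict. The sketch below is not part of the diff; it assumes the gem also binds llama_backend_init, LlamaModelParams, llama_model_load_from_file, llama_free, llama_model_free, and llama_backend_free from llama.cpp's C API (not shown in this hunk), and the model path is a placeholder:

```ruby
# Minimal usage sketch for the 0.18.x API (assumed bindings noted above).
require 'llama_cpp'

LlamaCpp.llama_backend_init

# Load a GGUF model; the path is a placeholder.
model_params = LlamaCpp::LlamaModelParams.new
model = LlamaCpp.llama_model_load_from_file('/path/to/model.gguf', model_params)

# Any valid context works: generate extracts the model from it and then
# creates its own context and greedy sampler internally.
ctx_params = LlamaCpp::LlamaContextParams.new
context = LlamaCpp.llama_init_from_model(model, ctx_params)

puts LlamaCpp.generate(context, 'Hello my name is', n_predict: 32)

LlamaCpp.llama_free(context)
LlamaCpp.llama_model_free(model)
LlamaCpp.llama_backend_free
```

The old LLaMACpp constant is kept as an alias of LlamaCpp, so existing code that references LLaMACpp still resolves, but the module-function style API above replaces the former object-oriented wrapper classes.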
metadata CHANGED
@@ -1,14 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.17.10
+ version: 0.18.1
  platform: ruby
  authors:
  - yoshoku
- autorequire:
  bindir: exe
  cert_chain: []
- date: 2024-09-07 00:00:00.000000000 Z
+ date: 2025-02-15 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -22,17 +21,11 @@ files:
  - CODE_OF_CONDUCT.md
  - LICENSE.txt
  - README.md
- - examples/README.md
- - examples/chat.rb
- - examples/embedding.rb
- - examples/prompt_jp.txt
- - examples/simple.rb
  - ext/llama_cpp/extconf.rb
- - ext/llama_cpp/llama_cpp.cpp
+ - ext/llama_cpp/llama_cpp.c
  - ext/llama_cpp/llama_cpp.h
  - lib/llama_cpp.rb
  - lib/llama_cpp/version.rb
- - sig/llama_cpp.rbs
  homepage: https://github.com/yoshoku/llama_cpp.rb
  licenses:
  - MIT
@@ -42,7 +35,6 @@ metadata:
  changelog_uri: https://github.com/yoshoku/llama_cpp.rb/blob/main/CHANGELOG.md
  documentation_uri: https://yoshoku.github.io/llama_cpp.rb/doc/
  rubygems_mfa_required: 'true'
- post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -57,8 +49,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.5.9
- signing_key:
+ rubygems_version: 3.6.2
  specification_version: 4
  summary: Ruby bindings for the llama.cpp.
  test_files: []
data/examples/README.md DELETED
@@ -1,92 +0,0 @@
- # llama_cpp.rb/examples
-
- ## chat.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./chat.rb -m /path/to/quantized-model.bin -t 4
- ...
- User: Please tell me the largest city in Japan.
- Bob: Sure. The largest city in Japan is Tokyo.
- User:
- ```
-
- ### Options
-
- ```sh
- $ ./chat.rb help main
- Usage:
- chat.rb main -m, --model=MODEL
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -f, [--file=FILE] # prompt file to start generation
- -r, [--reverse-prompt=REVERSE_PROMPT] # halt generation at PROMPT, return control in interactive mode
- -b, [--batch-size=N] # batch size for prompt processing
- # Default: 1024
- -n, [--n-predict=N] # number of tokens to predict
- # Default: 256
- [--keep=N] # number of tokens to keep from the initial prompt
- # Default: 48
- [--repeat-last-n=N] # last n tokens to consider for penalize
- # Default: 64
- [--repeat-penalty=N] # penalize repeat sequence of tokens
- # Default: 1.0
- [--presence-penalty=N] # repeat alpha presence penalty
- # Default: 0.0
- [--frequency-penalty=N] # repeat alpha frequency penalty
- # Default: 0.0
- [--top-k=N] # top k sampling
- # Default: 40
- [--top-p=N] # top p sampling
- # Default: 0.95
- [--tfs-z=N] # tail free sampling, parameter z
- # Default: 1.0
- [--typical-p=N] # locally typical sampling, parameter p
- # Default: 1.0
- [--temp=N] # temperature
- # Default: 0.8
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Start chat
- ```
-
- ## embedding.rb
-
- ### Usage
-
- ```sh
- $ cd examples
- $ gem install llama_cpp thor
- $ ./embedding.rb -m /path/to/quantized-model.bin -t 4 -p 'Hello, World.'
- ...
- 0.7191136479377747 0.5564611554145813 1.4210394620895386 -1.4874695539474487
- ```
-
- ### Options
-
- ```
- $ ./embedding.rb help main
- Usage:
- embedding.rb main -m, --model=MODEL -p, --prompt=PROMPT
-
- Options:
- -s, [--seed=N] # random seed
- # Default: -1
- -t, [--threads=N] # number of threads
- # Default: 2
- -m, --model=MODEL # path to model file
- -p, --prompt=PROMPT # prompt to generate embedding
- [--n-gpu-layers=N] # number of layers on GPU
- # Default: 0
-
- Extract embedding from prompt
- ```
data/examples/chat.rb DELETED
@@ -1,198 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # chat.rb is a simple chatbot that uses llama_cpp to generate text.
- # It is created with reference to main.cpp and chat.sh in llama.cpp examples:
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/chat.sh
-
- require 'llama_cpp'
- require 'thor'
- require 'readline'
- require 'etc'
-
- class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
- default_command :main
- desc 'main', 'Start chat'
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :file, type: :string, aliases: '-f', desc: 'prompt file to start generation'
- option :reverse_prompt, type: :string, aliases: '-r', desc: 'halt generation at PROMPT, return control in interactive mode'
- option :batch_size, type: :numeric, aliases: '-b', desc: 'batch size for prompt processing', default: 1024
- option :n_predict, type: :numeric, aliases: '-n', desc: 'number of tokens to predict', default: 256
- option :keep, type: :numeric, desc: 'number of tokens to keep from the initial prompt', default: 48
- option :repeat_last_n, type: :numeric, desc: 'last n tokens to consider for penalize', default: 64
- option :repeat_penalty, type: :numeric, desc: 'penalize repeat sequence of tokens', default: 1.0
- option :presence_penalty, type: :numeric, desc: 'repeat alpha presence penalty', default: 0.0
- option :frequency_penalty, type: :numeric, desc: 'repeat alpha frequency penalty', default: 0.0
- option :top_k, type: :numeric, desc: 'top k sampling', default: 40
- option :top_p, type: :numeric, desc: 'top p sampling', default: 0.95
- option :tfs_z, type: :numeric, desc: 'tail free sampling, parameter z', default: 1.0
- option :typical_p, type: :numeric, desc: 'locally typical sampling, parameter p', default: 1.0
- option :temp, type: :numeric, desc: 'temperature', default: 0.8
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
- mdl_params = LLaMACpp::ModelParams.new
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
- ctx_params = LLaMACpp::ContextParams.new
- ctx_params.seed = options[:seed] if options[:seed] != -1
- ctx_params.n_threads = options[:n_threads]
- ctx_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
-
- antiprompt = options[:reverse_prompt] || 'User:'
- start_prompt = read_prompt(options[:file]) || default_prompt(antiprompt)
-
- embd_input = context.model.tokenize(text: start_prompt, add_bos: true)
-
- n_ctx = context.n_ctx
- raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
- n_keep = options[:keep]
- n_keep = embd_input.size if n_keep > embd_input.size
-
- last_n_tokens = [0] * n_ctx
- interactive = true
- is_interacting = false
- input_echo = true
- first_input = true
- embd = []
- n_consumed = 0
- n_past = 0
- n_remain = options[:n_predict]
- n_vocab = context.model.n_vocab
-
- while interactive
- unless embd.empty?
- if n_past + embd.size > n_ctx
- n_left = n_past - n_keep
- n_past = [1, n_keep].max
- embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
- end
-
- 0.step(embd.size - 1, options[:batch_size]) do |i|
- n_eval = [options[:batch_size], embd.size - i].min
- context.decode(LLaMACpp::Batch.get_one(tokens: embd[i...(i + n_eval)], n_tokens: n_eval, pos_zero: n_past, seq_id: 0))
- n_past += n_eval
- end
- end
-
- embd.clear
-
- if embd_input.size <= n_consumed && !is_interacting
- logits = context.logits
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- last_n_repeat = [last_n_tokens.size, options[:repeat_last_n], n_ctx].min
- context.sample_repetition_penalties(
- candidates,
- last_n_tokens[-last_n_repeat..],
- penalty_repeat: options[:repeat_penalty],
- penalty_freq: options[:frequency_penalty],
- penalty_present: options[:presence_penalty]
- )
-
- context.sample_top_k(candidates, k: options[:top_k])
- context.sample_tail_free(candidates, z: options[:tfs_z])
- context.sample_typical(candidates, prob: options[:typical_p])
- context.sample_top_p(candidates, prob: options[:top_p])
- context.sample_temp(candidates, temp: options[:temp])
- id = context.sample_token(candidates)
-
- last_n_tokens.shift
- last_n_tokens.push(id)
-
- if id == context.model.token_eos
- id = context.model.token_nl
- unless antiprompt.empty?
- first_antiprompt = context.model.tokenize(text: antiprompt, add_bos: false)
- embd_input.concat(first_antiprompt)
- end
- end
-
- embd.push(id)
- input_echo = true
- n_remain -= 1
- else
- while embd_input.size > n_consumed
- embd.push(embd_input[n_consumed])
- last_n_tokens.shift
- last_n_tokens.push(embd_input[n_consumed])
- n_consumed += 1
- break if embd.size >= options[:batch_size]
- end
- end
-
- if input_echo
- output = embd.map { |token| context.model.token_to_piece(token) }
- output_str = output.join
- output_str.chomp!(antiprompt) if first_input
- print(output_str)
- end
-
- if embd_input.size <= n_consumed
- if antiprompt.size.positive?
- last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
- last_output_str = last_output.join
-
- search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
- unless last_output_str.index(antiprompt, search_start_pos).nil?
- is_interacting = true
- true
- end
- end
-
- if n_past.positive? && is_interacting
- if first_input
- print("\r#{antiprompt}")
- first_input = false
- end
- buffer = Readline.readline(' ')
- break interactive = false if buffer.nil?
-
- if buffer.size > 1
- line_input = context.model.tokenize(text: "#{buffer}\n", add_bos: false)
- embd_input.concat(line_input)
- n_remain -= line_input.size
- end
-
- input_echo = false
- end
-
- is_interacting = false if n_past.positive?
- end
-
- if n_remain <= 0 && options[:n_predict] != -1
- n_remain = options[:n_predict]
- is_interacting = true
- end
- end
- end
-
- private
-
- def read_prompt(filename)
- return if filename.nil?
-
- File.read(filename).chomp
- end
-
- def default_prompt(antiprompt)
- # Reference:
- # https://github.com/ggerganov/llama.cpp/blob/master/prompts/chat-with-bob.txt
- prompt = <<~MSG
- Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
- User: Hello, Bob.
- Bob: Hello. How may I help you today?
- User: Please tell me the largest city in Europe.
- Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
- MSG
- prompt + antiprompt
- end
- end
-
- Chat.start(ARGV)
@@ -1,42 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- # embedding.rb extracts embedding from prompt.
5
- # It is created with reference to embedding.cpp in llama.cpp examples:
6
- # - https://github.com/ggerganov/llama.cpp/blob/master/examples/embedding/embedding.cpp
7
-
8
- require 'llama_cpp'
9
- require 'thor'
10
- require 'etc'
11
-
12
- class Embedding < Thor # rubocop:disable Style/Documentation
13
- default_command :main
14
- desc 'main', 'Extract embedding from prompt'
15
- option :seed, type: :numeric, aliases: '-s', desc: 'random seed', default: -1
16
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
17
- option :prompt, type: :string, aliases: '-p', desc: 'prompt to generate embedding', required: true
18
- option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
19
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
20
- def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
21
- mdl_params = LLaMACpp::ModelParams.new
22
- mdl_params.n_gpu_layers = options[:n_gpu_layers]
23
- model = LLaMACpp::Model.new(model_path: options[:model], params: mdl_params)
24
- ctx_params = LLaMACpp::ContextParams.new
25
- ctx_params.embedding = true
26
- ctx_params.seed = options[:seed] if options[:seed] != -1
27
- ctx_params.n_threads = options[:n_threads]
28
- ctx_params.n_threads_batch = options[:n_threads]
29
- context = LLaMACpp::Context.new(model: model, params: ctx_params)
30
-
31
- embd_input = context.model.tokenize(text: options[:prompt], add_bos: true)
32
-
33
- return unless embd_input.size.positive?
34
-
35
- context.decode(LLaMACpp::Batch.get_one(tokens: embd_input, n_tokens: embd_input.size, pos_zero: 0, seq_id: 0))
36
-
37
- context.embeddings.each { |val| print("#{val} ") }
38
- print("\n")
39
- end
40
- end
41
-
42
- Embedding.start(ARGV)
data/examples/prompt_jp.txt DELETED
@@ -1,8 +0,0 @@
- UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
- Taroは親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
-
- User: こんにちには、Taro。
- Taro: こんにちは、今日はどのような要件ですか?
- User: 日本で最大の都市について教えてください。
- Taro: はい、日本で最大の都市は東京です。日本の首都でもあります。
- User:
data/examples/simple.rb DELETED
@@ -1,96 +0,0 @@
- #!/usr/bin/env ruby
- # frozen_string_literal: true
-
- # simple.rb is a simple text completion script.
- # It is created with reference to simple.cpp in llama.cpp examples:
- # https://github.com/ggerganov/llama.cpp/blob/master/examples/simple/simple.cpp
-
- require 'llama_cpp'
- require 'thor'
- require 'etc'
-
- class Simple < Thor # rubocop:disable Style/Documentation
- default_command :main
- desc 'main', 'Simple completion'
- option :model, type: :string, aliases: '-m', desc: 'path to model file', required: true
- option :prompt, type: :string, aliases: '-p', desc: 'prompt to start with', default: 'Hello my name is'
- option :n_threads, type: :numeric, desc: 'number of threads', default: Etc.nprocessors
- def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
- n_len = 32
- model_params = LLaMACpp::ModelParams.new
- model = LLaMACpp::Model.new(model_path: options[:model], params: model_params)
- context_params = LLaMACpp::ContextParams.new
- context_params.seed = 1234
- context_params.n_ctx = 2048
- context_params.logits_all = true
- context_params.n_threads = options[:n_threads]
- context_params.n_threads_batch = options[:n_threads]
- context = LLaMACpp::Context.new(model: model, params: context_params)
-
- tokens_list = context.model.tokenize(text: options[:prompt], add_bos: true)
- n_ctx = context.n_ctx
- n_kv_req = tokens_list.size + (n_len - tokens_list.size)
- raise 'n_kv_req > n_ctx, the required KV cache size is not big enough' if n_kv_req > n_ctx
-
- print("\nmain: n_len = #{n_len}, n_ctx = #{n_ctx}, n_kv_req = #{n_kv_req}\n\n")
-
- tokens_list.each { |token| print(context.model.token_to_piece(token)) }
-
- batch = LLaMACpp::Batch.new(max_n_token: 512, n_embd: 0, max_n_seq: 1)
- tokens_list.each_with_index do |token, id|
- batch.set_token(batch.n_tokens, token)
- batch.set_pos(batch.n_tokens, id)
- batch.set_n_seq_id(batch.n_tokens, 1)
- batch.set_seq_id(batch.n_tokens, 0, 0)
- batch.set_logits(batch.n_tokens, false)
- batch.n_tokens = batch.n_tokens + 1
- end
-
- batch.set_logits(batch.n_tokens - 1, true)
-
- context.decode(batch)
-
- n_cur = batch.n_tokens
- n_decode = 0
- n_vocab = context.model.n_vocab
-
- t_start = Time.now
-
- while n_cur <= n_len
- logits = context.logits[((batch.n_tokens - 1) * n_vocab)..]
-
- base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i] || 0.0, p: 0.0) }
- candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
- new_token_id = context.sample_token_greedy(candidates)
-
- if new_token_id == context.model.token_eos || n_cur == n_len
- print("\n")
- break
- end
-
- print(context.model.token_to_piece(new_token_id))
-
- batch.n_tokens = 0
-
- batch.set_token(batch.n_tokens, new_token_id)
- batch.set_pos(batch.n_tokens, n_cur)
- batch.set_n_seq_id(batch.n_tokens, 1)
- batch.set_seq_id(batch.n_tokens, 0, 0)
- batch.set_logits(batch.n_tokens, true)
- batch.n_tokens = batch.n_tokens + 1
-
- n_decode += 1
- n_cur += 1
- context.decode(batch)
- end
-
- t_end = Time.now
-
- print("\nmain: decoded #{n_decode} tokens in #{(t_end - t_start).floor(2)} s, speed: #{n_decode.fdiv(t_end - t_start).floor(2)} t/s\n\n")
-
- LLaMACpp.backend_free
- end
- end
-
- Simple.start(ARGV)