rllama 1.0.0-x86_64-linux-gnu
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +214 -0
- data/bin/rllama +8 -0
- data/lib/rllama/cli.rb +183 -0
- data/lib/rllama/context.rb +233 -0
- data/lib/rllama/cpp.rb +690 -0
- data/lib/rllama/loader.rb +210 -0
- data/lib/rllama/model.rb +103 -0
- data/lib/rllama/version.rb +5 -0
- data/lib/rllama/x86_64-linux/libggml-base.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-alderlake.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-haswell.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-icelake.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sandybridge.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sapphirerapids.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-skylakex.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sse42.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-x64.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-rpc.so +0 -0
- data/lib/rllama/x86_64-linux/libggml.so +0 -0
- data/lib/rllama/x86_64-linux/libllama.so +0 -0
- data/lib/rllama.rb +37 -0
- metadata +79 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 117ba4498d285d83158050dec220dab571fde8abe9f62ca9a1da07c07c1750f2
  data.tar.gz: 0e2a7bc873398ad490d3f90b9488b808bd197544c98747f916be9d7d9914ce1f
SHA512:
  metadata.gz: 128cde36b03ced401b2d7f3a9ceb84c00ef91381368e4f9b8eba2714e99af4d552e102d8a23f300440e25b20712f2b81233f8c254ec44cb82232a88fd84093ca
  data.tar.gz: 77609e417bae1d4c2b8d9e2351ce753b1adefe03f81ca761a3ad59013fecb856aaa7d1ab1c6e2a2f9a9031a057cd474f1f1c63b31d3d5dfa7802efdd636f73d7
data/README.md
ADDED
@@ -0,0 +1,214 @@
# Rllama

Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'rllama'
```

And then execute:

```bash
bundle install
```

Or install it yourself as:

```bash
gem install rllama
```

## Usage

### Text Generation

Generate text completions using local language models:

```ruby
require 'rllama'

# Load a model
model = Rllama.load_model('lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf')

# Generate text
result = model.generate('What is the capital of France?')
puts result.text
# => "The capital of France is Paris."

# Access generation statistics
puts "Tokens generated: #{result.stats[:tokens_generated]}"
puts "Tokens per second: #{result.stats[:tps]}"
puts "Duration: #{result.stats[:duration]} seconds"

# Don't forget to close the model when done
model.close
```

#### Generation parameters

Adjust generation with optional parameters:

```ruby
result = model.generate(
  'Write a short poem about Ruby programming',
  max_tokens: 2024,
  temperature: 0.8,
  top_k: 40,
  top_p: 0.95,
  min_p: 0.05
)
```

#### Streaming generation

Stream generated text token-by-token:

```ruby
model.generate('Explain quantum computing') do |token|
  print token
end
```

#### System prompt

Include a system prompt to guide model behavior:

```ruby
result = model.generate(
  'What are best practices for Ruby development?',
  system: 'You are an expert Ruby developer with 10 years of experience.'
)
```

#### Messages list

Pass multiple messages with roles for more complex interactions:

```ruby
result = model.generate([
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'What is the capital of France?' },
  { role: 'assistant', content: 'The capital of France is Paris.' },
  { role: 'user', content: 'What is its population?' }
])
puts result.text
```

### Chat

For ongoing conversations, use a context object that maintains the conversation history:

```ruby
# Initialize a chat context
context = model.init_context

# Send messages and maintain conversation history
response1 = context.message('What is the capital of France?')
puts response1.text
# => "The capital of France is Paris."

response2 = context.message('What is the population of that city?')
puts response2.text
# => "Paris has a population of approximately 2.1 million people..."

response3 = context.message('What was my first message?')
puts response3.text
# => "Your first message was asking about the capital of France."

# The context remembers all previous messages in the conversation

# Close context when done
context.close
```
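
Streaming also works on a chat context: `Context#generate` (aliased as `message`) yields each decoded token to a block, which is exactly how the bundled CLI prints responses. A minimal sketch with a fresh context:

```ruby
# Stream a reply token-by-token while the context keeps the conversation history
context = model.init_context

context.message('Explain quantum computing in two sentences') do |token|
  print token
  $stdout.flush
end

context.close
```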

### Embeddings

Generate vector embeddings for text using embedding models:

```ruby
require 'rllama'

# Load an embedding model
model = Rllama.load_model('lmstudio-community/embeddinggemma-300m-qat-GGUF/embeddinggemma-300m-qat-Q4_0.gguf')

# Generate embedding for a single text
embedding = model.embed('Hello, world!')
puts embedding.length
# => 768 (depending on your model)

# Generate embeddings for multiple sentences
embeddings = model.embed([
  'roses are red',
  'violets are blue',
  'sugar is sweet'
])

puts embeddings.length
# => 3
puts embeddings[0].length
# => 768

model.close
```

#### Vector parameters

By default, embedding vectors are normalized. You can disable normalization with `normalize: false`:

```ruby
# Generate unnormalized embeddings
embedding = model.embed('Sample text', normalize: false)

# Use custom batch size for processing multiple texts
embeddings = model.embed(
  ['roses are red', 'violets are blue', 'sugar is sweet'],
  normalize: true
)
```
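
Because the default normalized vectors have unit length, cosine similarity between two embeddings reduces to a plain dot product. A minimal sketch of comparing sentences this way (the resulting scores depend on the model):

```ruby
# Cosine similarity of unit-length (normalized) vectors is just the dot product
def cosine_similarity(a, b)
  a.zip(b).sum { |x, y| x * y }
end

red, blue = model.embed(['roses are red', 'violets are blue'])

puts cosine_similarity(red, blue)
# => a Float between -1.0 and 1.0; higher means more similar
```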

## CLI Chat Utility

The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:

```bash
rllama
```

When you run `rllama` without arguments, it will display:

- **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
- **Popular models**: A curated list of popular models available for download, including:
  - Gemma 3 1B
  - Llama 3.2 3B
  - Phi-4
  - Qwen3 30B
  - GPT-OSS
  - And more...

Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
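
For example, with one model already downloaded, the selection menu looks roughly like this (the names and sizes come from the CLI's built-in list; your listing will differ):

```
Downloaded models:

 1. gemma-3-1B-it-QAT-Q4_0 (687MB)

Popular models (not downloaded):

 2. gpt-oss-20b-MXFP4 (11.3GB)
 3. Llama-3.2-3B-Instruct-Q4_K_M (1.9GB)
 ...

Enter number (1-7):
```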

You can also specify a model path directly:

```bash
rllama path/to/your/model.gguf
```

Once the model loads, you can start chatting.

## Finding Models

You can download models in GGUF format from various sources:

- [Hugging Face](https://huggingface.co/models?library=gguf) - search for models published in GGUF format
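
`Rllama.load_model` accepts either a Hugging Face `user/repo/file.gguf` path, as in the examples above (the CLI stores its downloads under `~/.rllama/models/`), or the path to a GGUF file already on disk. A quick sketch:

```ruby
# Load by Hugging Face repository path
model = Rllama.load_model('bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf')

# Or load a local file directly
model = Rllama.load_model('/path/to/your/model.gguf')
```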

## License

MIT

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/docusealco/rllama.
data/bin/rllama
ADDED
data/lib/rllama/cli.rb
ADDED
@@ -0,0 +1,183 @@
# frozen_string_literal: true

require 'readline'

module Rllama
  class Cli
    POPULAR_MODELS = [
      { path: 'lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf', size: 720_425_472 },
      { path: 'lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf', size: 12_109_565_632 },
      { path: 'bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf', size: 2_019_377_696 },
      { path: 'unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-Q3_K_S.gguf', size: 13_292_468_800 },
      { path: 'inclusionAI/Ling-mini-2.0-GGUF/Ling-mini-2.0-Q4_K_M.gguf', size: 9_911_575_072 },
      { path: 'unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-Q4_K_S.gguf', size: 4_404_697_216 },
      { path: 'microsoft/phi-4-gguf/phi-4-Q4_K_S.gguf', size: 8_440_762_560 }
    ].freeze

    COLOR_CODES = {
      red: 31,
      green: 32,
      yellow: 33,
      blue: 34,
      magenta: 35,
      cyan: 36
    }.freeze

    def self.start(args)
      new(args).run
    end

    def initialize(args)
      @args = args

      @model_path = args.first
    end

    def run
      model_path = select_or_load_model

      puts "\n#{colorize('Loading model...', :yellow)}"

      model = Rllama.load_model(model_path)
      context = model.init_context

      puts colorize('Model loaded successfully!', :green)
      puts "\n#{colorize('Chat started. Type your message and press Enter. Type "exit" or "quit" to end the chat.',
                         :cyan)}\n\n"

      chat_loop(context)
    rescue Interrupt
      puts "\n\n#{colorize('Chat interrupted. Goodbye!', :yellow)}"
      exit(0)
    rescue StandardError => e
      puts "\n#{colorize("Error: #{e.message}", :red)}"
      exit(1)
    ensure
      context&.close
      model&.close
    end

    private

    def select_or_load_model
      return @model_path if @model_path

      downloaded_models = find_downloaded_models
      downloaded_model_names = downloaded_models.map { |path| File.basename(path, '.gguf') }

      available_popular = POPULAR_MODELS.reject do |popular_model|
        popular_filename = File.basename(popular_model[:path], '.gguf')
        downloaded_model_names.any?(popular_filename)
      end

      all_choices = []
      current_index = 1

      unless downloaded_models.empty?
        puts "#{colorize('Downloaded models:', :cyan)}\n\n"

        downloaded_models.each do |model|
          display_name = File.basename(model, '.gguf')
          size = format_file_size(File.size(model))
          puts " #{colorize(current_index.to_s, :green)}. #{display_name} #{colorize("(#{size})", :yellow)}"
          all_choices << model
          current_index += 1
        end

        puts "\n"
      end

      unless available_popular.empty?
        puts "#{colorize('Popular models (not downloaded):', :cyan)}\n\n"

        available_popular.each do |model|
          display_name = File.basename(model[:path], '.gguf')
          puts " #{colorize(current_index.to_s, :green)}. " \
               "#{display_name} #{colorize("(#{format_file_size(model[:size])})", :yellow)}"
          all_choices << model[:path]
          current_index += 1
        end

        puts "\n"
      end

      if all_choices.empty?
        puts colorize('No models available', :yellow)
        exit(1)
      end

      print colorize("Enter number (1-#{all_choices.length}): ", :cyan)

      choice = $stdin.gets&.strip.to_i

      if choice < 1 || choice > all_choices.length
        puts colorize('Invalid choice', :red)

        exit(1)
      end

      all_choices[choice - 1]
    end

    def find_downloaded_models
      models_dir = File.join(Dir.home, '.rllama', 'models')

      return [] unless Dir.exist?(models_dir)

      Dir.glob(File.join(models_dir, '**', '*.gguf')).reject do |path|
        basename = File.basename(path)

        basename.start_with?('~', '!')
      end
    end

    def format_file_size(bytes)
      gb = bytes / (1024.0**3)

      if gb >= 1.0
        format('%.1fGB', gb)
      else
        mb = bytes / (1024.0**2)

        format('%dMB', mb.round)
      end
    end

    def chat_loop(context)
      loop do
        user_input = Readline.readline('> ', false)&.strip

        break if user_input.nil?

        next if user_input.empty?

        if user_input.downcase == 'exit' || user_input.downcase == 'quit'
          puts "\n#{colorize('Goodbye!', :yellow)}"

          break
        end

        puts "\n"

        print "#{colorize('Assistant:', :magenta, bold: true)} "

        context.generate(user_input) do |token|
          print token
          $stdout.flush
        end

        puts "\n\n"
      end
    end

    def colorize(text, color, bold: false)
      return text unless $stdout.tty?

      code = COLOR_CODES[color] || 37

      prefix = bold ? "\e[1;#{code}m" : "\e[#{code}m"

      "#{prefix}#{text}\e[0m"
    end
  end
end
data/lib/rllama/context.rb
ADDED
@@ -0,0 +1,233 @@
# frozen_string_literal: true

module Rllama
  class Context
    attr_reader :messages, :n_ctx, :n_batch, :n_past

    def initialize(model, embeddings: false, n_ctx: nil, n_batch: nil)
      @model = model
      @n_ctx = n_ctx
      @n_batch = n_batch
      @embeddings = embeddings

      @ctx_params = Cpp.llama_context_default_params

      @ctx_params[:n_ctx] = @n_ctx
      @ctx_params[:n_batch] = @n_batch

      if @embeddings
        @ctx_params[:n_seq_max] = [@n_batch, @model.n_seq_max].min
        @ctx_params[:embeddings] = true
      end

      @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)

      raise Error, 'Failed to create the llama_context' if @pointer.null?

      @n_past = 0
      @messages = []
    end

    def generate(message, role: 'user', max_tokens: @n_ctx, temperature: 0.8, top_k: 40, top_p: 0.95, min_p: 0.05,
                 seed: nil, system: nil)
      @messages << { role: 'system', content: system } if system && @messages.empty?

      if message.is_a?(Array)
        @messages.push(*message)
      elsif message.is_a?(Hash)
        @messages.push(message)
      else
        @messages << { role: role, content: message }
      end

      prompt_string = @model.build_chat_template(@messages)

      n_prompt_tokens = -Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, nil, 0, true, true)

      raise Error, 'Prompt is too long.' if n_prompt_tokens.negative?

      prompt_tokens_ptr = FFI::MemoryPointer.new(:int32, n_prompt_tokens)
      tokens_written = Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, prompt_tokens_ptr,
                                          n_prompt_tokens, true, true)

      raise Error, 'Failed to tokenize prompt.' if tokens_written.negative?

      new_token_count = tokens_written - @n_past

      if new_token_count.positive?
        new_tokens_ptr = prompt_tokens_ptr + (@n_past * FFI.type_size(:int32))

        batch = Cpp.llama_batch_get_one(new_tokens_ptr, new_token_count)

        raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0

        @n_past = tokens_written
      end

      chain_params = Cpp.llama_sampler_chain_default_params
      sampler_chain = Cpp.llama_sampler_chain_init(chain_params)

      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_min_p(min_p, 1)) if min_p
      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_k(top_k)) if top_k&.positive?
      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_p(top_p, 1)) if top_p && top_p < 1.0
      if temperature&.positive?
        Cpp.llama_sampler_chain_add(sampler_chain,
                                    Cpp.llama_sampler_init_temp(temperature))
      end

      is_probabilistic = temperature&.positive? || top_k&.positive? || (top_p && top_p < 1.0) || !min_p.nil?
      rng_seed = seed || (Random.new_seed & 0xFFFFFFFF)

      if is_probabilistic
        Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_dist(rng_seed))
      else
        Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_greedy)
      end

      n_decoded = 0

      generated_text = ''.b

      assistant_message = { role: 'assistant', content: generated_text }

      @messages << assistant_message

      start_time = Time.now

      loop do
        break if n_decoded >= max_tokens

        new_token_id = Cpp.llama_sampler_sample(sampler_chain, @pointer, -1)

        break if Cpp.llama_vocab_is_eog(@model.vocab, new_token_id)

        buffer = FFI::MemoryPointer.new(:char, 256)
        n_chars = Cpp.llama_token_to_piece(@model.vocab, new_token_id, buffer, buffer.size, 0, true)

        if n_chars >= 0
          piece_bytes = buffer.read_string(n_chars)
          utf8_piece = piece_bytes.force_encoding(Encoding::UTF_8)
          generated_text << utf8_piece
          yield utf8_piece if block_given?
        end

        token_ptr = FFI::MemoryPointer.new(:int32, 1).put_int32(0, new_token_id)
        batch = Cpp.llama_batch_get_one(token_ptr, 1)

        raise Error, 'context length has been exceeded' if @n_past >= @n_ctx
        raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0

        @n_past += 1
        n_decoded += 1
      end

      end_time = Time.now

      duration = end_time - start_time

      tps = n_decoded.positive? && duration.positive? ? n_decoded / duration : 0

      Cpp.llama_sampler_free(sampler_chain)

      Result.new(
        text: generated_text,
        stats: {
          duration:,
          tokens_generated: n_decoded,
          tps:,
          seed: rng_seed
        }
      )
    end
    alias message generate

    def embed(strings, normalize: true, batch_size: 512)
      is_array = strings.is_a?(Array)

      strings = Array(strings) unless is_array

      tokenized_strings = strings.map do |text|
        max_tokens = text.bytesize + 2
        tokens_ptr = FFI::MemoryPointer.new(:int32, max_tokens)
        count = Cpp.llama_tokenize(@model.vocab, text, text.bytesize, tokens_ptr, max_tokens, true, false)

        raise Error, "Failed to tokenize text: '#{text}'" if count.negative?

        tokens_ptr.read_array_of_int32(count)
      end

      all_embeddings = []
      batch = Cpp.llama_batch_init(batch_size, 0, 1)
      prompts_in_batch = []
      current_batch_token_count = 0

      process_batch = lambda do
        next if prompts_in_batch.empty?

        batch[:n_tokens] = current_batch_token_count

        raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?

        prompts_in_batch.each do |seq_id_in_batch|
          embd_ptr = Cpp.llama_get_embeddings_seq(@pointer, seq_id_in_batch)

          raise Error, 'Failed to get embedding' if embd_ptr.null?

          embedding = embd_ptr.read_array_of_float(@model.n_embd)

          all_embeddings << (normalize ? normalize_embedding(embedding) : embedding)
        end

        current_batch_token_count = 0
        prompts_in_batch = []
      end

      tokenized_strings.each do |tokens|
        batch_full = (current_batch_token_count + tokens.size) > batch_size
        seq_limit_reached = prompts_in_batch.size >= @model.n_seq_max
        process_batch.call if !prompts_in_batch.empty? && (batch_full || seq_limit_reached)

        seq_id = prompts_in_batch.size
        prompts_in_batch << seq_id

        tokens.each_with_index do |token_id, pos|
          idx = current_batch_token_count

          batch[:token].put_int32(idx * FFI.type_size(:int32), token_id)
          batch[:pos].put_int32(idx * FFI.type_size(:int32), pos)
          batch[:n_seq_id].put_int32(idx * FFI.type_size(:int32), 1)
          batch[:seq_id].get_pointer(idx * FFI::Pointer.size).put_int32(0, seq_id)
          batch[:logits].put_int8(idx, pos == tokens.size - 1 ? 1 : 0)

          current_batch_token_count += 1
        end
      end

      process_batch.call

      Cpp.llama_batch_free(batch)

      is_array ? all_embeddings : all_embeddings[0]
    end

    def embeddings?
      @embeddings
    end

    def close
      Cpp.llama_free(@pointer)
    end

    def norm(vec)
      Math.sqrt(vec.sum { |x| x**2 })
    end

    def normalize_embedding(vec)
      n = norm(vec)

      return vec if n.zero?

      vec.map { |x| x / n }
    end
  end
end