rllama 1.0.0-x64-mingw-ucrt → 1.0.2-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +34 -35
- data/lib/rllama/context.rb +39 -14
- data/lib/rllama/cpp.rb +26 -5
- data/lib/rllama/loader.rb +4 -2
- data/lib/rllama/model.rb +24 -3
- data/lib/rllama/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d385047c3570bba997c66148e0f512830945972ac071ac202f16a4be051376f3
+  data.tar.gz: b99783d0625064fd94091dfa8ddc8797b14019776d1d00a6e059a182af1fcb96
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b96e29c7fd372b81a64f4b14b07228c38296c44b79e60f9f1ade1fe9a3f2a726744124599593b7588e7698a850413006d87fd5668244b3ef7a0b7d5dbc3bc9c
+  data.tar.gz: 32e45805d8d6f2ce85c230273a421db70d2e4e251dd9a2fbbf0eff69119e628a7848dfab9624705434829e08089d13f5e0ebbd8bb98233a411d605a91ccec98b
data/README.md
CHANGED
@@ -1,3 +1,5 @@
+<img width="336" height="212.0" alt="Logo" src="https://github.com/user-attachments/assets/e27442fb-22d1-44cf-ba3d-f10b24c13652" />
+
 # Rllama
 
 Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.
@@ -22,6 +24,38 @@ Or install it yourself as:
 gem install rllama
 ```
 
+## CLI Chat
+
+The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
+
+```bash
+rllama
+```
+
+When you run `rllama` without arguments, it will display:
+
+- **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
+- **Popular models**: A curated list of popular models available for download, including:
+  - Gemma 3 1B
+  - Llama 3.2 3B
+  - Phi-4
+  - Qwen3 30B
+  - GPT-OSS
+
+Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
+
+You can also specify a model path or URL directly:
+
+```bash
+rllama path/to/your/model.gguf
+```
+
+```bash
+rllama https://huggingface.co/microsoft/phi-4-gguf/resolve/main/phi-4-Q3_K_S.gguf
+```
+
+Once the model has loaded, you can start chatting.
+
 ## Usage
 
 ### Text Generation
@@ -162,43 +196,8 @@ By default, embedding vectors are normalized. You can disable normalization with
 ```ruby
 # Generate unnormalized embeddings
 embedding = model.embed('Sample text', normalize: false)
-
-# Use custom batch size for processing multiple texts
-embeddings = model.embed(
-  ['roses are red', 'violets are blue', 'sugar is sweet'],
-  normalize: true
-)
-```
-
-## CLI Chat Utility
-
-The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
-
-```bash
-rllama
-```
-
-When you run `rllama` without arguments, it will display:
-
-- **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
-- **Popular models**: A curated list of popular models available for download, including:
-  - Gemma 3 1B
-  - Llama 3.2 3B
-  - Phi-4
-  - Qwen3 30B
-  - GPT-OSS
-  - And more...
-
-Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
-
-You can also specify a model path directly:
-
-```bash
-rllama path/to/your/model.gguf
 ```
 
-Once the model loads, you can start chatting.
-
 ## Finding Models
 
 You can download GGUF format models from various sources:
data/lib/rllama/context.rb
CHANGED
@@ -12,12 +12,21 @@ module Rllama
 
       @ctx_params = Cpp.llama_context_default_params
 
-      @ctx_params[:n_ctx] = @n_ctx
-      @ctx_params[:n_batch] = @n_batch
+      @ctx_params[:n_ctx] = @n_ctx if @n_ctx
+      @ctx_params[:n_batch] = @n_batch if @n_batch
 
       if @embeddings
-
+        seq_cap = @model.n_seq_max
+
+        if @n_batch&.positive? && seq_cap&.positive?
+          @ctx_params[:n_seq_max] = [@n_batch, seq_cap].min
+        elsif seq_cap&.positive?
+          @ctx_params[:n_seq_max] = seq_cap
+        end
+
         @ctx_params[:embeddings] = true
+        @ctx_params[:kv_unified] = true
+        @ctx_params[:n_ubatch] = @n_batch if @n_batch&.positive?
       end
 
       @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)
@@ -141,19 +150,31 @@ module Rllama
     end
     alias message generate
 
-    def embed(
-
+    def embed(strings_or_tokens, normalize: true, batch_size: 512)
+      is_tokens = strings_or_tokens.is_a?(Array) &&
+                  (strings_or_tokens[0].is_a?(Integer) ||
+                   (strings_or_tokens[0].is_a?(Array) && strings_or_tokens[0][0].is_a?(Integer)))
 
-
+      input_is_array = is_tokens ? strings_or_tokens[0].is_a?(Array) : strings_or_tokens.is_a?(Array)
 
-
-
-
-
+      normalized_inputs = input_is_array ? strings_or_tokens : [strings_or_tokens]
+
+      tokenized_strings =
+        if is_tokens
+          input_is_array ? strings_or_tokens : [strings_or_tokens]
+        else
+          normalized_inputs.map { |text| @model.tokenize(text) }
+        end
 
-
+      max_tokens_in_prompt = tokenized_strings.map(&:length).max || 0
 
-
+      if max_tokens_in_prompt > batch_size
+        raise Error, "batch_size (#{batch_size}) is smaller than the longest prompt (#{max_tokens_in_prompt} tokens)."
+      end
+
+      if max_tokens_in_prompt > @n_batch
+        raise Error, "Context n_batch (#{@n_batch}) is smaller than the longest " \
+                     "prompt (#{max_tokens_in_prompt} tokens). Increase batch_size when calling embed."
       end
 
       all_embeddings = []
@@ -166,6 +187,9 @@ module Rllama
 
         batch[:n_tokens] = current_batch_token_count
 
+        memory_ptr = Cpp.llama_get_memory(@pointer)
+        Cpp.llama_memory_clear(memory_ptr, true) unless memory_ptr.null?
+
         raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?
 
         prompts_in_batch.each do |seq_id_in_batch|
@@ -179,7 +203,8 @@ module Rllama
         end
 
         current_batch_token_count = 0
-
+
+        prompts_in_batch.clear
       end
 
       tokenized_strings.each do |tokens|
@@ -207,7 +232,7 @@ module Rllama
 
       Cpp.llama_batch_free(batch)
 
-
+      input_is_array ? all_embeddings : all_embeddings[0]
     end
 
     def embeddings?
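The reworked `Context#embed` above accepts four input shapes: a single string, an array of strings, a single array of token ids, or an array of token-id arrays. The `is_tokens` / `input_is_array` checks decide how the input is normalized and whether one vector or an array of vectors is returned. Below is a standalone sketch of that classification logic, extracted purely for illustration (the `classify` helper is not part of the gem):

```ruby
# Mirrors the is_tokens / input_is_array checks from the diff above,
# pulled out into a plain method so the four accepted shapes are visible.
def classify(strings_or_tokens)
  is_tokens = strings_or_tokens.is_a?(Array) &&
              (strings_or_tokens[0].is_a?(Integer) ||
               (strings_or_tokens[0].is_a?(Array) && strings_or_tokens[0][0].is_a?(Integer)))

  input_is_array = is_tokens ? strings_or_tokens[0].is_a?(Array) : strings_or_tokens.is_a?(Array)

  { tokens: is_tokens, array: input_is_array }
end

classify('hello')             #=> { tokens: false, array: false } (one string, one vector back)
classify(%w[hello world])     #=> { tokens: false, array: true }  (array in, array of vectors back)
classify([1, 2, 3])           #=> { tokens: true,  array: false } (pre-tokenized single prompt)
classify([[1, 2], [3, 4, 5]]) #=> { tokens: true,  array: true }  (pre-tokenized batch)
```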
data/lib/rllama/cpp.rb
CHANGED
@@ -8,7 +8,7 @@ module Rllama
 
     LIB_NAME = 'llama'
 
-
+    PLATFORM =
       case FFI::Platform::OS
       when 'darwin'
         FFI::Platform::ARCH == 'aarch64' ? 'arm64-darwin' : 'x86_64-darwin'
@@ -28,12 +28,27 @@ module Rllama
          "lib#{LIB_NAME}.so"
        end
 
-
-
+    PLATFORM_DIR = File.join(__dir__, PLATFORM)
+
+    platform_path = File.join(PLATFORM_DIR, lib_file)
 
     lib_paths = []
+
     lib_paths << platform_path if File.exist?(platform_path)
 
+    ggml_lib_file =
+      case FFI::Platform::OS
+      when 'darwin'
+        'libggml.dylib'
+      when 'windows', 'mingw32'
+        'ggml.dll'
+      else
+        'libggml.so'
+      end
+
+    ggml_platform_path = File.join(PLATFORM_DIR, ggml_lib_file)
+    lib_paths << ggml_platform_path if File.exist?(ggml_platform_path)
+
     lib_paths +=
       case FFI::Platform::OS
       when 'darwin'
@@ -436,6 +451,8 @@ module Rllama
     attach_function :llama_backend_init, [], :void
     attach_function :llama_backend_free, [], :void
     attach_function :llama_numa_init, [:int], :void # ggml_numa_strategy
+    attach_function :ggml_backend_load_all, [], :void
+    attach_function :ggml_backend_load_all_from_path, [:string], :void
 
     # Threadpool
     attach_function :llama_attach_threadpool, %i[llama_context_p ggml_threadpool_t ggml_threadpool_t], :void
@@ -681,10 +698,14 @@ module Rllama
       llama_log_set(@log_callback, nil)
     end
 
-    llama_backend_init
-
     silence_log!
 
+    if File.directory?(PLATFORM_DIR)
+      ggml_backend_load_all_from_path(PLATFORM_DIR)
+    else
+      ggml_backend_load_all
+    end
+
     freeze
   end
 end
data/lib/rllama/loader.rb
CHANGED
@@ -62,6 +62,8 @@ module Rllama
 
       local_path = File.join(dir, org, repo, file_path)
 
+      return local_path if File.exist?(local_path)
+
       puts "Destination: #{local_path}"
 
       download_file(url, local_path, "HuggingFace model: #{hf_path}")
@@ -74,6 +76,8 @@ module Rllama
 
       local_path = File.join(dir, filename)
 
+      return local_path if File.exist?(local_path)
+
       puts "Destination: #{local_path}"
 
       download_file(url, local_path, "URL: #{url}")
@@ -82,8 +86,6 @@ module Rllama
     def download_file(url, local_path, description)
       FileUtils.mkdir_p(File.dirname(local_path))
 
-      return local_path if File.exist?(local_path)
-
       temp_path = File.join(File.dirname(local_path), "~#{File.basename(local_path)}")
 
       existing_size = File.exist?(temp_path) ? File.size(temp_path) : 0
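The loader change moves the `return local_path if File.exist?(local_path)` guard out of `download_file` and into both resolution paths, so an already-downloaded file is returned before the "Destination:" message is printed or any network request is made. A hypothetical standalone illustration of the pattern (method names here are invented, not the gem's API):

```ruby
require 'fileutils'

# Return the cached path immediately; only announce and download when missing.
def resolve(local_path)
  return local_path if File.exist?(local_path)

  puts "Destination: #{local_path}"
  download(local_path)
end

# Stand-in for the gem's resumable download_file helper.
def download(local_path)
  FileUtils.mkdir_p(File.dirname(local_path))
  File.write(local_path, '')
  local_path
end

path = resolve(File.expand_path('~/.rllama/models/example.gguf'))
resolve(path) # second call returns the path without printing or downloading
```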
data/lib/rllama/model.rb
CHANGED
@@ -47,11 +47,32 @@ module Rllama
     alias message generate
 
     def embed(prompt, normalize: true, batch_size: 512, &block)
-
-
+      inputs = prompt.is_a?(Array) ? prompt : [prompt]
+
+      tokenized_inputs = inputs.map { |text| tokenize(text, max_tokens: n_ctx_train) }
+      max_token_length = tokenized_inputs.map(&:length).max || 0
+
+      effective_batch_size = [batch_size, max_token_length].max
+      effective_ctx = [n_ctx_train, max_token_length].min
+
+      init_embedding_context(n_ctx: effective_ctx, n_batch: effective_batch_size) do |ctx|
+        inputs = prompt.is_a?(Array) ? tokenized_inputs : tokenized_inputs[0]
+
+        ctx.embed(inputs, normalize:, batch_size: effective_batch_size, &block)
       end
     end
 
+    def tokenize(text, max_tokens: nil)
+      size = text.bytesize + 2
+
+      tokens_ptr = FFI::MemoryPointer.new(:int32, size)
+      count = Cpp.llama_tokenize(vocab, text, text.bytesize, tokens_ptr, size, true, false)
+
+      raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+
+      tokens_ptr.read_array_of_int32([count, max_tokens].compact.min)
+    end
+
     def close
       Cpp.llama_model_free(@pointer)
     end
@@ -70,7 +91,7 @@ module Rllama
       context
     end
 
-    def init_embedding_context(n_ctx:
+    def init_embedding_context(n_ctx: n_ctx_train, n_batch: 512, &)
       init_context(embeddings: true, n_ctx:, n_batch:, &)
     end
 
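Taken together, `Model#embed` now tokenizes up front, grows the batch size to fit the longest prompt, sizes the embedding context accordingly, and hands pre-tokenized input to `Context#embed`. A minimal usage sketch, assuming `model` is an already-loaded rllama model (the loading call is outside this diff):

```ruby
# One string in, one embedding vector out (normalization can be disabled).
vector = model.embed('Sample text', normalize: false)

# An array of strings returns an array of vectors; batch_size is raised
# automatically if the longest prompt needs more than 512 tokens.
vectors = model.embed(['roses are red', 'violets are blue', 'sugar is sweet'],
                      batch_size: 512)

# The new Model#tokenize helper returns integer token ids, optionally truncated.
tokens = model.tokenize('roses are red', max_tokens: 32)
```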
data/lib/rllama/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rllama
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.2
 platform: x64-mingw-ucrt
 authors:
 - Pete Matsyburka
 bindir: bin
 cert_chain: []
-date: 2025-10-
+date: 2025-10-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ffi