rllama 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +14 -4
- data/bin/rllama +0 -2
- data/lib/rllama/cli.rb +11 -5
- data/lib/rllama/context.rb +6 -1
- data/lib/rllama/cpp.rb +10 -6
- data/lib/rllama/model.rb +50 -2
- data/lib/rllama/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e5b617098c5919065ccd3ae3d9da4ed1c068f121415ec06caea749b50dcc17eb
|
|
4
|
+
data.tar.gz: 27da8c412f1327e425834ab6429d2d0de7131d8ea14e7a7f3c72629ae75b457a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cf4c1df062c79fa36b1a5a641513ee8a9b960c79b8b077dc25d97931c5d630b618c8dac950845ccad394fb4f60814ec756255d3a2f1847db68ae622719fecb30
|
|
7
|
+
data.tar.gz: '03000528b45aa265d6acc8696eb495b5c9ac57b143cba0e8ab3fc333ce0cf824b8ba86f77d1ab38522080e62e7887a934f2543d1d16f8e6e3c333f48a0ff9585'
|
data/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# Rllama
|
|
4
4
|
|
|
5
|
-
Ruby bindings for [llama.cpp](https://github.com/
|
|
5
|
+
Ruby bindings for [llama.cpp](https://github.com/ggml-org/llama.cpp) to run open-source language models locally. Run models like Gemma 4, Qwen 3.5, GLM 4.7, Nemotron, LFM2, Llama 3, and many others directly in your Ruby application code.
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
@@ -24,6 +24,12 @@ Or install it yourself as:
|
|
|
24
24
|
gem install rllama
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
+
### Troubleshooting
|
|
28
|
+
|
|
29
|
+
#### `llama_model_load_from_file_impl: no backends are loaded`
|
|
30
|
+
|
|
31
|
+
If you're running on an Intel (x86_64) CPU and encounter this error while loading a model, make sure you're using the latest version of the gem. Rllama now preloads the bundled GGML backend libraries and automatically augments the `GGML_BACKEND_PATH`, so upgrading resolves the issue without any manual steps. If you build from source, ensure that the directory containing the GGML backend libraries is included in `GGML_BACKEND_PATH` before booting your Ruby process.
|
|
32
|
+
|
|
27
33
|
## CLI Chat
|
|
28
34
|
|
|
29
35
|
The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
|
|
@@ -36,11 +42,14 @@ When you run `rllama` without arguments, it will display:
|
|
|
36
42
|
|
|
37
43
|
- **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
|
|
38
44
|
- **Popular models**: A curated list of popular models available for download, including:
|
|
39
|
-
- Gemma
|
|
45
|
+
- Gemma 4 E4B / Gemma 4 26B-A4B
|
|
46
|
+
- Nemotron 3 Nano 4B
|
|
47
|
+
- Qwen 3.5 35B-A3B
|
|
48
|
+
- LFM2 24B-A2B
|
|
49
|
+
- GLM 4.7 Flash
|
|
50
|
+
- GPT-OSS 20B
|
|
40
51
|
- Llama 3.2 3B
|
|
41
52
|
- Phi-4
|
|
42
|
-
- Qwen3 30B
|
|
43
|
-
- GPT-OSS
|
|
44
53
|
|
|
45
54
|
Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
|
|
46
55
|
|
|
@@ -204,6 +213,7 @@ You can download GGUF format models from various sources:
|
|
|
204
213
|
|
|
205
214
|
- [Hugging Face](https://huggingface.co/models?library=gguf) - Search for models with "GGUF" format
|
|
206
215
|
|
|
216
|
+
|
|
207
217
|
## License
|
|
208
218
|
|
|
209
219
|
MIT
|
data/bin/rllama
CHANGED
data/lib/rllama/cli.rb
CHANGED
|
@@ -5,13 +5,19 @@ require 'readline'
|
|
|
5
5
|
module Rllama
|
|
6
6
|
class Cli
|
|
7
7
|
POPULAR_MODELS = [
|
|
8
|
-
{ path: 'lmstudio-community/gemma-
|
|
9
|
-
{ path: 'lmstudio-community/
|
|
10
|
-
{ path: '
|
|
11
|
-
{ path: 'unsloth/Qwen3-
|
|
8
|
+
{ path: 'lmstudio-community/gemma-4-E4B-it-GGUF/gemma-4-E4B-it-Q4_K_M.gguf', size: 5_335_285_280 },
|
|
9
|
+
{ path: 'lmstudio-community/gemma-4-26B-A4B-it-GGUF/gemma-4-26B-A4B-it-Q4_K_M.gguf', size: 16_796_010_624 },
|
|
10
|
+
{ path: 'unsloth/NVIDIA-Nemotron-3-Nano-4B-GGUF/NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf', size: 2_900_295_712 },
|
|
11
|
+
{ path: 'unsloth/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q4_K_M.gguf', size: 22_016_023_168 },
|
|
12
|
+
{ path: 'lmstudio-community/LFM2-24B-A2B-GGUF/LFM2-24B-A2B-Q4_K_M.gguf', size: 14_415_473_952 },
|
|
13
|
+
{ path: 'unsloth/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-Q4_K_M.gguf', size: 18_312_339_808 },
|
|
12
14
|
{ path: 'inclusionAI/Ling-mini-2.0-GGUF/Ling-mini-2.0-Q4_K_M.gguf', size: 9_911_575_072 },
|
|
15
|
+
{ path: 'lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf', size: 12_109_565_632 },
|
|
13
16
|
{ path: 'unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-Q4_K_S.gguf', size: 4_404_697_216 },
|
|
14
|
-
{ path: '
|
|
17
|
+
{ path: 'unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-Q3_K_S.gguf', size: 13_292_468_800 },
|
|
18
|
+
{ path: 'lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf', size: 720_425_472 },
|
|
19
|
+
{ path: 'microsoft/phi-4-gguf/phi-4-Q4_K_S.gguf', size: 8_440_762_560 },
|
|
20
|
+
{ path: 'bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf', size: 2_019_377_696 }
|
|
15
21
|
].freeze
|
|
16
22
|
|
|
17
23
|
COLOR_CODES = {
|
data/lib/rllama/context.rb
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'etc'
|
|
4
|
+
|
|
3
5
|
module Rllama
|
|
4
6
|
class Context
|
|
5
7
|
attr_reader :messages, :n_ctx, :n_batch, :n_past
|
|
6
8
|
|
|
7
|
-
def initialize(model, embeddings: false, n_ctx: nil, n_batch: nil)
|
|
9
|
+
def initialize(model, embeddings: false, n_ctx: nil, n_batch: nil, n_threads: Etc.nprocessors)
|
|
8
10
|
@model = model
|
|
9
11
|
@n_ctx = n_ctx
|
|
10
12
|
@n_batch = n_batch
|
|
@@ -15,6 +17,9 @@ module Rllama
|
|
|
15
17
|
@ctx_params[:n_ctx] = @n_ctx if @n_ctx
|
|
16
18
|
@ctx_params[:n_batch] = @n_batch if @n_batch
|
|
17
19
|
|
|
20
|
+
@ctx_params[:n_threads] = n_threads
|
|
21
|
+
@ctx_params[:n_threads_batch] = n_threads
|
|
22
|
+
|
|
18
23
|
if @embeddings
|
|
19
24
|
seq_cap = @model.n_seq_max
|
|
20
25
|
|
data/lib/rllama/cpp.rb
CHANGED
|
@@ -15,7 +15,11 @@ module Rllama
|
|
|
15
15
|
when 'windows', 'mingw32'
|
|
16
16
|
'x64-mingw32'
|
|
17
17
|
else
|
|
18
|
-
FFI::Platform::ARCH == 'aarch64' ? 'aarch64
|
|
18
|
+
arch = FFI::Platform::ARCH == 'aarch64' ? 'aarch64' : 'x86_64'
|
|
19
|
+
|
|
20
|
+
is_musl = defined?(FFI::Platform::IS_GNU) ? !FFI::Platform::IS_GNU : RbConfig::CONFIG['host_os'].include?('musl')
|
|
21
|
+
|
|
22
|
+
is_musl ? "#{arch}-linux-musl" : "#{arch}-linux"
|
|
19
23
|
end
|
|
20
24
|
|
|
21
25
|
lib_file =
|
|
@@ -359,7 +363,9 @@ module Rllama
|
|
|
359
363
|
:no_perf, :bool,
|
|
360
364
|
:op_offload, :bool,
|
|
361
365
|
:swa_full, :bool,
|
|
362
|
-
:kv_unified, :bool
|
|
366
|
+
:kv_unified, :bool,
|
|
367
|
+
:samplers, :pointer,
|
|
368
|
+
:n_samplers, :size_t
|
|
363
369
|
end
|
|
364
370
|
|
|
365
371
|
class LlamaModelQuantizeParams < FFI::Struct
|
|
@@ -533,10 +539,8 @@ module Rllama
|
|
|
533
539
|
attach_function :llama_adapter_lora_free, [:llama_adapter_lora_p], :void
|
|
534
540
|
attach_function :llama_adapter_get_alora_n_invocation_tokens, [:llama_adapter_lora_p], :uint64
|
|
535
541
|
attach_function :llama_adapter_get_alora_invocation_tokens, [:llama_adapter_lora_p], :pointer # const llama_token*
|
|
536
|
-
attach_function :
|
|
537
|
-
attach_function :
|
|
538
|
-
attach_function :llama_clear_adapter_lora, [:llama_context_p], :void
|
|
539
|
-
attach_function :llama_apply_adapter_cvec, %i[llama_context_p pointer size_t int32 int32 int32], :int32
|
|
542
|
+
attach_function :llama_set_adapters_lora, %i[llama_context_p pointer size_t pointer], :int32
|
|
543
|
+
attach_function :llama_set_adapter_cvec, %i[llama_context_p pointer size_t int32 int32 int32], :int32
|
|
540
544
|
|
|
541
545
|
# Memory management
|
|
542
546
|
attach_function :llama_memory_clear, %i[llama_memory_t bool], :void
|
data/lib/rllama/model.rb
CHANGED
|
@@ -4,6 +4,16 @@ module Rllama
|
|
|
4
4
|
class Model
|
|
5
5
|
DEFAULT_CONTEXT_LENGTH = 2**13
|
|
6
6
|
|
|
7
|
+
FALLBACK_TEMPLATES = {
|
|
8
|
+
'gemma4' => {
|
|
9
|
+
bos: '<bos>',
|
|
10
|
+
role_map: { 'assistant' => 'model' },
|
|
11
|
+
turn_start: ->(role) { "<|turn>#{role}\n" },
|
|
12
|
+
turn_end: "<turn|>\n",
|
|
13
|
+
generation_prompt: "<|turn>model\n"
|
|
14
|
+
}
|
|
15
|
+
}.freeze
|
|
16
|
+
|
|
7
17
|
attr_reader :pointer
|
|
8
18
|
|
|
9
19
|
def initialize(path_or_name, dir: nil)
|
|
@@ -36,6 +46,16 @@ module Rllama
|
|
|
36
46
|
@n_ctx_train ||= Cpp.llama_model_n_ctx_train(@pointer)
|
|
37
47
|
end
|
|
38
48
|
|
|
49
|
+
def architecture
|
|
50
|
+
@architecture ||= begin
|
|
51
|
+
buf = FFI::MemoryPointer.new(:char, 256)
|
|
52
|
+
|
|
53
|
+
n = Cpp.llama_model_meta_val_str(@pointer, 'general.architecture', buf, 256)
|
|
54
|
+
|
|
55
|
+
n.positive? ? buf.read_string(n) : nil
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
39
59
|
def generate(prompt, max_tokens: DEFAULT_CONTEXT_LENGTH, temperature: 0.8, top_k: 40, top_p: 0.95, min_p: 0.05,
|
|
40
60
|
seed: nil, system: nil, &block)
|
|
41
61
|
init_context(n_ctx: max_tokens) do |ctx|
|
|
@@ -98,6 +118,14 @@ module Rllama
|
|
|
98
118
|
def build_chat_template(messages)
|
|
99
119
|
raise Error, 'Model does not provide a chat template' if chat_template.nil? || chat_template.empty?
|
|
100
120
|
|
|
121
|
+
result = apply_chat_template(messages)
|
|
122
|
+
|
|
123
|
+
return result if result
|
|
124
|
+
|
|
125
|
+
apply_chat_template_fallback(messages)
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def apply_chat_template(messages)
|
|
101
129
|
count = messages.length
|
|
102
130
|
struct_size = Cpp::LlamaChatMessage.size
|
|
103
131
|
array_ptr = FFI::MemoryPointer.new(struct_size * count)
|
|
@@ -111,14 +139,34 @@ module Rllama
|
|
|
111
139
|
|
|
112
140
|
needed = Cpp.llama_chat_apply_template(chat_template, array_ptr, count, true, nil, 0)
|
|
113
141
|
|
|
114
|
-
|
|
142
|
+
return nil if needed.negative?
|
|
115
143
|
|
|
116
144
|
buf = FFI::MemoryPointer.new(:char, needed)
|
|
117
145
|
written = Cpp.llama_chat_apply_template(chat_template, array_ptr, count, true, buf, needed)
|
|
118
146
|
|
|
119
|
-
|
|
147
|
+
return nil if written.negative?
|
|
120
148
|
|
|
121
149
|
buf.read_string(written)
|
|
122
150
|
end
|
|
151
|
+
|
|
152
|
+
def apply_chat_template_fallback(messages)
|
|
153
|
+
tmpl = FALLBACK_TEMPLATES[architecture]
|
|
154
|
+
|
|
155
|
+
raise Error, "Unsupported chat template for architecture: #{architecture || 'unknown'}" unless tmpl
|
|
156
|
+
|
|
157
|
+
result = String.new(tmpl[:bos] || '')
|
|
158
|
+
role_map = tmpl[:role_map] || {}
|
|
159
|
+
|
|
160
|
+
messages.each do |m|
|
|
161
|
+
role = role_map[m[:role].to_s] || m[:role].to_s
|
|
162
|
+
result << tmpl[:turn_start].call(role)
|
|
163
|
+
result << m[:content].to_s
|
|
164
|
+
result << tmpl[:turn_end]
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
result << tmpl[:generation_prompt]
|
|
168
|
+
|
|
169
|
+
result
|
|
170
|
+
end
|
|
123
171
|
end
|
|
124
172
|
end
|
data/lib/rllama/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rllama
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0
|
|
4
|
+
version: 1.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Pete Matsyburka
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: ffi
|
|
@@ -61,7 +61,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
61
61
|
- !ruby/object:Gem::Version
|
|
62
62
|
version: '0'
|
|
63
63
|
requirements: []
|
|
64
|
-
rubygems_version:
|
|
64
|
+
rubygems_version: 4.0.3
|
|
65
65
|
specification_version: 4
|
|
66
|
-
summary: Ruby bindings for
|
|
66
|
+
summary: Ruby bindings for llama.cpp to run local LLMs with Ruby.
|
|
67
67
|
test_files: []
|