rllama 1.0.0-arm64-darwin → 1.0.2-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 11d7ddcc0931dc67db511e81488bc3a2a19d564c3fdffbcdae938a9cf77fbd74
- data.tar.gz: 157cb3a06a1469788ddffd31466ab363114e747587befa59583efd37e3b8796f
+ metadata.gz: ad943f40faafeaf0a622fc29006a1e70a8b650996356b8f12ac07a8fab689358
+ data.tar.gz: 5de4cfd00f49df0b1b8025417c84d7717a41c6d60c5921a706e6cc02a0a3fcb7
  SHA512:
- metadata.gz: 40c0670a3b6aa77aefedc09e8763e9175f1be7074b243dc682763389ba6187c522650cb1b5ea54f3921b129733f30d1512acfe5f8793e31ff7747ad1ec41a4e2
- data.tar.gz: 1c81121425b454ce3803a650754a9ba2882a4254c4f414d98c30e707897965493ef1f2f3288fbcbae363af11259cefe8336ebc5cec94c0fdf6a6e1cf86d0d93a
+ metadata.gz: 799e088b716958ad4363835104b99ef3edfa423a12220e07df92f1ff81a2ede03e5151d7c242ef39cc3d0e151688722b900bb35b974d225dcb6d4450994664b2
+ data.tar.gz: 550298eb56c7f451354e3d620b729c247c2b2d97bf0f9f29220bb6365ba15134b46dbf65062f6df370118e5f21cebcdd974eae6e4ea7ec9528a0ab8e34813593
data/README.md CHANGED
@@ -1,3 +1,5 @@
+ <img width="336" height="212.0" alt="Logo" src="https://github.com/user-attachments/assets/e27442fb-22d1-44cf-ba3d-f10b24c13652" />
+
  # Rllama
 
  Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.
@@ -22,6 +24,38 @@ Or install it yourself as:
  gem install rllama
  ```
 
+ ## CLI Chat
+
+ The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
+
+ ```bash
+ rllama
+ ```
+
+ When you run `rllama` without arguments, it will display:
+
+ - **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
+ - **Popular models**: A curated list of popular models available for download, including:
+   - Gemma 3 1B
+   - Llama 3.2 3B
+   - Phi-4
+   - Qwen3 30B
+   - GPT-OSS
+
+ Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
+
+ You can also specify a model path or URL directly:
+
+ ```bash
+ rllama path/to/your/model.gguf
+ ```
+
+ ```bash
+ rllama https://huggingface.co/microsoft/phi-4-gguf/resolve/main/phi-4-Q3_K_S.gguf
+ ```
+
+ Once the model has loaded, you can start chatting.
+
  ## Usage
 
  ### Text Generation
@@ -162,43 +196,8 @@ By default, embedding vectors are normalized. You can disable normalization with
  ```ruby
  # Generate unnormalized embeddings
  embedding = model.embed('Sample text', normalize: false)
-
- # Use custom batch size for processing multiple texts
- embeddings = model.embed(
-   ['roses are red', 'violets are blue', 'sugar is sweet'],
-   normalize: true
- )
- ```
-
- ## CLI Chat Utility
-
- The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
-
- ```bash
- rllama
- ```
-
- When you run `rllama` without arguments, it will display:
-
- - **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
- - **Popular models**: A curated list of popular models available for download, including:
-   - Gemma 3 1B
-   - Llama 3.2 3B
-   - Phi-4
-   - Qwen3 30B
-   - GPT-OSS
-   - And more...
-
- Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
-
- You can also specify a model path directly:
-
- ```bash
- rllama path/to/your/model.gguf
  ```
 
- Once the model loads, you can start chatting.
-
  ## Finding Models
 
  You can download GGUF format models from various sources:
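The README change above drops the multi-string example, but `embed` still accepts either a single string or an array of strings. A minimal usage sketch, assuming `model` is an already-loaded `Rllama::Model` (model loading itself is not shown in this diff):

```ruby
# Assumes `model` is an already-loaded Rllama::Model; loading is not shown in this diff.
embedding = model.embed('Sample text', normalize: false) # one unnormalized vector

embeddings = model.embed(
  ['roses are red', 'violets are blue', 'sugar is sweet'],
  normalize: true
)

puts embedding.length  # embedding dimensionality
puts embeddings.length # => 3, one vector per input string
```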
data/lib/rllama/context.rb CHANGED
@@ -12,12 +12,21 @@ module Rllama
 
  @ctx_params = Cpp.llama_context_default_params
 
- @ctx_params[:n_ctx] = @n_ctx
- @ctx_params[:n_batch] = @n_batch
+ @ctx_params[:n_ctx] = @n_ctx if @n_ctx
+ @ctx_params[:n_batch] = @n_batch if @n_batch
 
  if @embeddings
-   @ctx_params[:n_seq_max] = [@n_batch, @model.n_seq_max].min
+   seq_cap = @model.n_seq_max
+
+   if @n_batch&.positive? && seq_cap&.positive?
+     @ctx_params[:n_seq_max] = [@n_batch, seq_cap].min
+   elsif seq_cap&.positive?
+     @ctx_params[:n_seq_max] = seq_cap
+   end
+
    @ctx_params[:embeddings] = true
+   @ctx_params[:kv_unified] = true
+   @ctx_params[:n_ubatch] = @n_batch if @n_batch&.positive?
  end
 
  @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)
@@ -141,19 +150,31 @@ module Rllama
  end
  alias message generate
 
- def embed(strings, normalize: true, batch_size: 512)
-   is_array = strings.is_a?(Array)
+ def embed(strings_or_tokens, normalize: true, batch_size: 512)
+   is_tokens = strings_or_tokens.is_a?(Array) &&
+               (strings_or_tokens[0].is_a?(Integer) ||
+                (strings_or_tokens[0].is_a?(Array) && strings_or_tokens[0][0].is_a?(Integer)))
 
-   strings = Array(strings) unless is_array
+   input_is_array = is_tokens ? strings_or_tokens[0].is_a?(Array) : strings_or_tokens.is_a?(Array)
 
-   tokenized_strings = strings.map do |text|
-     max_tokens = text.bytesize + 2
-     tokens_ptr = FFI::MemoryPointer.new(:int32, max_tokens)
-     count = Cpp.llama_tokenize(@model.vocab, text, text.bytesize, tokens_ptr, max_tokens, true, false)
+   normalized_inputs = input_is_array ? strings_or_tokens : [strings_or_tokens]
+
+   tokenized_strings =
+     if is_tokens
+       input_is_array ? strings_or_tokens : [strings_or_tokens]
+     else
+       normalized_inputs.map { |text| @model.tokenize(text) }
+     end
 
-     raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+   max_tokens_in_prompt = tokenized_strings.map(&:length).max || 0
 
-     tokens_ptr.read_array_of_int32(count)
+   if max_tokens_in_prompt > batch_size
+     raise Error, "batch_size (#{batch_size}) is smaller than the longest prompt (#{max_tokens_in_prompt} tokens)."
+   end
+
+   if max_tokens_in_prompt > @n_batch
+     raise Error, "Context n_batch (#{@n_batch}) is smaller than the longest " \
+                  "prompt (#{max_tokens_in_prompt} tokens). Increase batch_size when calling embed."
    end
 
    all_embeddings = []
@@ -166,6 +187,9 @@ module Rllama
 
  batch[:n_tokens] = current_batch_token_count
 
+ memory_ptr = Cpp.llama_get_memory(@pointer)
+ Cpp.llama_memory_clear(memory_ptr, true) unless memory_ptr.null?
+
  raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?
 
  prompts_in_batch.each do |seq_id_in_batch|
@@ -179,7 +203,8 @@ module Rllama
  end
 
  current_batch_token_count = 0
- prompts_in_batch = []
+
+ prompts_in_batch.clear
  end
 
  tokenized_strings.each do |tokens|
@@ -207,7 +232,7 @@ module Rllama
 
  Cpp.llama_batch_free(batch)
 
- is_array ? all_embeddings : all_embeddings[0]
+ input_is_array ? all_embeddings : all_embeddings[0]
  end
 
  def embeddings?
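The reworked `Context#embed` now accepts pre-tokenized input as well as strings, and tells the two apart by inspecting the first element(s) of the argument. A standalone sketch of that detection logic (not the gem's code verbatim), covering the shapes the method handles:

```ruby
# Shapes accepted by the new embed: String, Array<String>,
# Array<Integer> (one tokenized prompt), Array<Array<Integer>> (many tokenized prompts).
def tokens_input?(input)
  input.is_a?(Array) &&
    (input[0].is_a?(Integer) ||
     (input[0].is_a?(Array) && input[0][0].is_a?(Integer)))
end

tokens_input?('hello world')            # => false (plain string)
tokens_input?(['hello', 'world'])       # => false (array of strings)
tokens_input?([101, 2023, 102])         # => true  (one tokenized prompt)
tokens_input?([[101, 2023], [101, 42]]) # => true  (batch of tokenized prompts)
```

The method also now rejects any prompt longer than `batch_size` or the context's `n_batch` up front, instead of failing later inside `llama_decode`.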
data/lib/rllama/cpp.rb CHANGED
@@ -8,7 +8,7 @@ module Rllama
 
  LIB_NAME = 'llama'
 
- platform =
+ PLATFORM =
    case FFI::Platform::OS
    when 'darwin'
      FFI::Platform::ARCH == 'aarch64' ? 'arm64-darwin' : 'x86_64-darwin'
@@ -28,12 +28,27 @@ module Rllama
      "lib#{LIB_NAME}.so"
    end
 
- platform_dir = File.join(__dir__, platform)
- platform_path = File.join(platform_dir, lib_file)
+ PLATFORM_DIR = File.join(__dir__, PLATFORM)
+
+ platform_path = File.join(PLATFORM_DIR, lib_file)
 
  lib_paths = []
+
  lib_paths << platform_path if File.exist?(platform_path)
 
+ ggml_lib_file =
+   case FFI::Platform::OS
+   when 'darwin'
+     'libggml.dylib'
+   when 'windows', 'mingw32'
+     'ggml.dll'
+   else
+     'libggml.so'
+   end
+
+ ggml_platform_path = File.join(PLATFORM_DIR, ggml_lib_file)
+ lib_paths << ggml_platform_path if File.exist?(ggml_platform_path)
+
  lib_paths +=
    case FFI::Platform::OS
    when 'darwin'
@@ -436,6 +451,8 @@ module Rllama
  attach_function :llama_backend_init, [], :void
  attach_function :llama_backend_free, [], :void
  attach_function :llama_numa_init, [:int], :void # ggml_numa_strategy
+ attach_function :ggml_backend_load_all, [], :void
+ attach_function :ggml_backend_load_all_from_path, [:string], :void
 
  # Threadpool
  attach_function :llama_attach_threadpool, %i[llama_context_p ggml_threadpool_t ggml_threadpool_t], :void
@@ -681,10 +698,14 @@ module Rllama
    llama_log_set(@log_callback, nil)
  end
 
- llama_backend_init
-
  silence_log!
 
+ if File.directory?(PLATFORM_DIR)
+   ggml_backend_load_all_from_path(PLATFORM_DIR)
+ else
+   ggml_backend_load_all
+ end
+
  freeze
  end
  end
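The cpp.rb change attaches the two ggml backend-loading entry points and prefers the backends shipped alongside the gem's prebuilt libraries. A minimal sketch of the same pattern in isolation, assuming `libggml` can be resolved by FFI (the gem bundles it under `lib/rllama/<platform>/`); the module and its library lookup here are illustrative, not the gem's code:

```ruby
require 'ffi'

# Sketch of the backend-loading pattern introduced above; the function names follow the diff.
module GgmlBackends
  extend FFI::Library
  ffi_lib 'ggml' # assumption: libggml is resolvable on the loader path

  attach_function :ggml_backend_load_all, [], :void
  attach_function :ggml_backend_load_all_from_path, [:string], :void

  # Prefer backends bundled next to the gem's native libraries; otherwise let
  # ggml search its default locations.
  def self.load(platform_dir)
    if File.directory?(platform_dir)
      ggml_backend_load_all_from_path(platform_dir)
    else
      ggml_backend_load_all
    end
  end
end
```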
data/lib/rllama/loader.rb CHANGED
@@ -62,6 +62,8 @@ module Rllama
 
  local_path = File.join(dir, org, repo, file_path)
 
+ return local_path if File.exist?(local_path)
+
  puts "Destination: #{local_path}"
 
  download_file(url, local_path, "HuggingFace model: #{hf_path}")
@@ -74,6 +76,8 @@ module Rllama
 
  local_path = File.join(dir, filename)
 
+ return local_path if File.exist?(local_path)
+
  puts "Destination: #{local_path}"
 
  download_file(url, local_path, "URL: #{url}")
@@ -82,8 +86,6 @@ module Rllama
  def download_file(url, local_path, description)
    FileUtils.mkdir_p(File.dirname(local_path))
 
-   return local_path if File.exist?(local_path)
-
    temp_path = File.join(File.dirname(local_path), "~#{File.basename(local_path)}")
 
    existing_size = File.exist?(temp_path) ? File.size(temp_path) : 0
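The loader.rb change hoists the cache check out of `download_file`, so a model that is already on disk is returned before any output or network activity. A condensed sketch of the resulting flow, with hypothetical helper names standing in for the gem's private methods:

```ruby
require 'fileutils'
require 'open-uri'

# Hypothetical condensed version of the cached-download flow after this change.
# The gem's real download_file resumes partial downloads and reports progress; this does not.
def fetch(url, local_path)
  return local_path if File.exist?(local_path) # cache hit: no message, no download

  puts "Destination: #{local_path}"
  FileUtils.mkdir_p(File.dirname(local_path))
  URI.open(url, 'rb') { |remote| File.binwrite(local_path, remote.read) }
  local_path
end
```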
data/lib/rllama/model.rb CHANGED
@@ -47,11 +47,32 @@ module Rllama
  alias message generate
 
  def embed(prompt, normalize: true, batch_size: 512, &block)
-   init_embedding_context do |ctx|
-     ctx.embed(prompt, normalize:, batch_size:, &block)
+   inputs = prompt.is_a?(Array) ? prompt : [prompt]
+
+   tokenized_inputs = inputs.map { |text| tokenize(text, max_tokens: n_ctx_train) }
+   max_token_length = tokenized_inputs.map(&:length).max || 0
+
+   effective_batch_size = [batch_size, max_token_length].max
+   effective_ctx = [n_ctx_train, max_token_length].min
+
+   init_embedding_context(n_ctx: effective_ctx, n_batch: effective_batch_size) do |ctx|
+     inputs = prompt.is_a?(Array) ? tokenized_inputs : tokenized_inputs[0]
+
+     ctx.embed(inputs, normalize:, batch_size: effective_batch_size, &block)
    end
  end
 
+ def tokenize(text, max_tokens: nil)
+   size = text.bytesize + 2
+
+   tokens_ptr = FFI::MemoryPointer.new(:int32, size)
+   count = Cpp.llama_tokenize(vocab, text, text.bytesize, tokens_ptr, size, true, false)
+
+   raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+
+   tokens_ptr.read_array_of_int32([count, max_tokens].compact.min)
+ end
+
  def close
    Cpp.llama_model_free(@pointer)
  end
@@ -70,7 +91,7 @@ module Rllama
    context
  end
 
- def init_embedding_context(n_ctx: 2048, n_batch: 512, &)
+ def init_embedding_context(n_ctx: n_ctx_train, n_batch: 512, &)
    init_context(embeddings: true, n_ctx:, n_batch:, &)
  end
 
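With these model.rb changes, `Model#embed` pre-tokenizes its inputs and sizes the embedding context from the longest prompt, and `Model#tokenize` is now a public helper. A minimal usage sketch, assuming `model` is an already-loaded `Rllama::Model`:

```ruby
# Assumes `model` is an already-loaded Rllama::Model; loading is not shown in this diff.
tokens = model.tokenize('The quick brown fox jumps over the lazy dog')
puts tokens.length # number of token ids produced for the prompt

# embed now raises its batch size to the longest prompt and caps n_ctx at n_ctx_train,
# so long inputs no longer require choosing batch_size by hand.
vector = model.embed('The quick brown fox jumps over the lazy dog')
puts vector.length # embedding dimensionality
```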
data/lib/rllama/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Rllama
-   VERSION = '1.0.0'
+   VERSION = '1.0.2'
  end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rllama
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 1.0.2
  platform: arm64-darwin
  authors:
  - Pete Matsyburka
  bindir: bin
  cert_chain: []
- date: 2025-10-05 00:00:00.000000000 Z
+ date: 2025-10-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: ffi