rllama 1.0.0-x64-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: f240a5056f01a0697574a0b8d58fc8c9dc8536539b9aea2b4eb8090fcdc205bd
+   data.tar.gz: c54446e1a2e6e00e76a17b1cda48ed73718036778b6e83b1882e0fadaa36d026
+ SHA512:
+   metadata.gz: fcf72b43c0ae67c19a1b78587450e45150b1bf164c975376db5c57de2e94979752f23ab5663eebe9df70c7a0b771474847c616fdc65e954cbb6ad4725e5b9f98
+   data.tar.gz: 3dfe1e131782c44bcc229ae8753b1c6d8ca83fc4f33929a33e33385a7328643208550911512f0868fed150ff24d6d4bf932fbdd338e00262990200ed6225c493
data/README.md ADDED
@@ -0,0 +1,214 @@
+ # Rllama
+
+ Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'rllama'
+ ```
+
+ And then execute:
+
+ ```bash
+ bundle install
+ ```
+
+ Or install it yourself as:
+
+ ```bash
+ gem install rllama
+ ```
+
+ ## Usage
+
+ ### Text Generation
+
+ Generate text completions using local language models:
+
+ ```ruby
+ require 'rllama'
+
+ # Load a model
+ model = Rllama.load_model('lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf')
+
+ # Generate text
+ result = model.generate('What is the capital of France?')
+ puts result.text
+ # => "The capital of France is Paris."
+
+ # Access generation statistics
+ puts "Tokens generated: #{result.stats[:tokens_generated]}"
+ puts "Tokens per second: #{result.stats[:tps]}"
+ puts "Duration: #{result.stats[:duration]} seconds"
+
+ # Don't forget to close the model when done
+ model.close
+ ```
+
+ #### Generation parameters
+
+ Adjust generation with optional parameters:
+
+ ```ruby
+ result = model.generate(
+   'Write a short poem about Ruby programming',
+   max_tokens: 2024,
+   temperature: 0.8,
+   top_k: 40,
+   top_p: 0.95,
+   min_p: 0.05
+ )
+ ```
+
+ #### Streaming generation
+
+ Stream generated text token-by-token:
+
+ ```ruby
+ model.generate('Explain quantum computing') do |token|
+   print token
+ end
+ ```
+
+ #### System prompt
+
+ Include a system prompt to guide model behavior:
+
+ ```ruby
+ result = model.generate(
+   'What are best practices for Ruby development?',
+   system: 'You are an expert Ruby developer with 10 years of experience.'
+ )
+ ```
+
+ #### Messages list
+
+ Pass multiple messages with roles for more complex interactions:
+
+ ```ruby
+ result = model.generate([
+   { role: 'system', content: 'You are a helpful assistant.' },
+   { role: 'user', content: 'What is the capital of France?' },
+   { role: 'assistant', content: 'The capital of France is Paris.' },
+   { role: 'user', content: 'What is its population?' }
+ ])
+ puts result.text
+ ```
+
+ ### Chat
+
+ For ongoing conversations, use a context object that maintains the conversation history:
+
+ ```ruby
+ # Initialize a chat context
+ context = model.init_context
+
+ # Send messages and maintain conversation history
+ response1 = context.message('What is the capital of France?')
+ puts response1.text
+ # => "The capital of France is Paris."
+
+ response2 = context.message('What is the population of that city?')
+ puts response2.text
+ # => "Paris has a population of approximately 2.1 million people..."
+
+ response3 = context.message('What was my first message?')
+ puts response3.text
+ # => "Your first message was asking about the capital of France."
+
+ # The context remembers all previous messages in the conversation
+
+ # Close context when done
+ context.close
+ ```
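+
+ The chat context can also stream replies token-by-token: `message` is an alias of `generate` (see the `Context` class source below), so it accepts the same block form used by the bundled CLI. A minimal sketch:
+
+ ```ruby
+ context.message('Tell me a joke about Ruby') do |token|
+   print token
+ end
+ ```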
+
+ ### Embeddings
+
+ Generate vector embeddings for text using embedding models:
+
+ ```ruby
+ require 'rllama'
+
+ # Load an embedding model
+ model = Rllama.load_model('lmstudio-community/embeddinggemma-300m-qat-GGUF/embeddinggemma-300m-qat-Q4_0.gguf')
+
+ # Generate embedding for a single text
+ embedding = model.embed('Hello, world!')
+ puts embedding.length
+ # => 768 (depending on your model)
+
+ # Generate embeddings for multiple sentences
+ embeddings = model.embed([
+   'roses are red',
+   'violets are blue',
+   'sugar is sweet'
+ ])
+
+ puts embeddings.length
+ # => 3
+ puts embeddings[0].length
+ # => 768
+
+ model.close
+ ```
+
+ #### Vector parameters
+
+ By default, embedding vectors are normalized. You can disable normalization with `normalize: false`:
+
+ ```ruby
+ # Generate unnormalized embeddings
+ embedding = model.embed('Sample text', normalize: false)
+
+ # Generate normalized embeddings (the default) for multiple texts
+ embeddings = model.embed(
+   ['roses are red', 'violets are blue', 'sugar is sweet'],
+   normalize: true
+ )
+ ```
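+
+ Because normalized vectors have unit length, the dot product of two embeddings equals their cosine similarity. A minimal sketch in plain Ruby, reusing the model loaded above:
+
+ ```ruby
+ # Dot product of two normalized embeddings == cosine similarity
+ a, b = model.embed(['roses are red', 'violets are blue'])
+ similarity = a.zip(b).sum { |x, y| x * y }
+ puts similarity.round(3)
+ ```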
+
+ ## CLI Chat Utility
+
+ The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
+
+ ```bash
+ rllama
+ ```
+
+ When you run `rllama` without arguments, it will display:
+
+ - **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
+ - **Popular models**: A curated list of popular models available for download, including:
+   - Gemma 3 1B
+   - Llama 3.2 3B
+   - Phi-4
+   - Qwen3 30B
+   - GPT-OSS
+   - And more...
+
+ Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
+
+ You can also specify a model path directly:
+
+ ```bash
+ rllama path/to/your/model.gguf
+ ```
+
+ Once the model loads, you can start chatting.
+
+ ## Finding Models
+
+ You can download GGUF-format models from various sources:
+
+ - [Hugging Face](https://huggingface.co/models?library=gguf) - search for models in the GGUF format
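+
+ Once you've found a GGUF file, its Hugging Face `org/repo/filename.gguf` path can be passed straight to `Rllama.load_model`, as in the examples above; the CLI caches its downloads under `~/.rllama/models/`. A sketch using the Llama 3.2 build from the CLI's popular-models list:
+
+ ```ruby
+ model = Rllama.load_model('bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf')
+ puts model.generate('Say hello in French.').text
+ model.close
+ ```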
+
+ ## License
+
+ MIT
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/docusealco/rllama.
data/bin/rllama ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require 'bundler/setup'
+ require 'rllama'
+ require 'rllama/cli'
+
+ Rllama::Cli.start(ARGV)
data/lib/rllama/cli.rb ADDED
@@ -0,0 +1,183 @@
+ # frozen_string_literal: true
+
+ require 'readline'
+
+ module Rllama
+   class Cli
+     POPULAR_MODELS = [
+       { path: 'lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf', size: 720_425_472 },
+       { path: 'lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf', size: 12_109_565_632 },
+       { path: 'bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf', size: 2_019_377_696 },
+       { path: 'unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-Q3_K_S.gguf', size: 13_292_468_800 },
+       { path: 'inclusionAI/Ling-mini-2.0-GGUF/Ling-mini-2.0-Q4_K_M.gguf', size: 9_911_575_072 },
+       { path: 'unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-Q4_K_S.gguf', size: 4_404_697_216 },
+       { path: 'microsoft/phi-4-gguf/phi-4-Q4_K_S.gguf', size: 8_440_762_560 }
+     ].freeze
+
+     COLOR_CODES = {
+       red: 31,
+       green: 32,
+       yellow: 33,
+       blue: 34,
+       magenta: 35,
+       cyan: 36
+     }.freeze
+
+     def self.start(args)
+       new(args).run
+     end
+
+     def initialize(args)
+       @args = args
+
+       @model_path = args.first
+     end
+
+     def run
+       model_path = select_or_load_model
+
+       puts "\n#{colorize('Loading model...', :yellow)}"
+
+       model = Rllama.load_model(model_path)
+       context = model.init_context
+
+       puts colorize('Model loaded successfully!', :green)
+       puts "\n#{colorize('Chat started. Type your message and press Enter. Type "exit" or "quit" to end the chat.',
+                          :cyan)}\n\n"
+
+       chat_loop(context)
+     rescue Interrupt
+       puts "\n\n#{colorize('Chat interrupted. Goodbye!', :yellow)}"
+       exit(0)
+     rescue StandardError => e
+       puts "\n#{colorize("Error: #{e.message}", :red)}"
+       exit(1)
+     ensure
+       context&.close
+       model&.close
+     end
+
+     private
+
+     def select_or_load_model
+       return @model_path if @model_path
+
+       downloaded_models = find_downloaded_models
+       downloaded_model_names = downloaded_models.map { |path| File.basename(path, '.gguf') }
+
+       available_popular = POPULAR_MODELS.reject do |popular_model|
+         popular_filename = File.basename(popular_model[:path], '.gguf')
+         downloaded_model_names.any?(popular_filename)
+       end
+
+       all_choices = []
+       current_index = 1
+
+       unless downloaded_models.empty?
+         puts "#{colorize('Downloaded models:', :cyan)}\n\n"
+
+         downloaded_models.each do |model|
+           display_name = File.basename(model, '.gguf')
+           size = format_file_size(File.size(model))
+           puts " #{colorize(current_index.to_s, :green)}. #{display_name} #{colorize("(#{size})", :yellow)}"
+           all_choices << model
+           current_index += 1
+         end
+
+         puts "\n"
+       end
+
+       unless available_popular.empty?
+         puts "#{colorize('Popular models (not downloaded):', :cyan)}\n\n"
+
+         available_popular.each do |model|
+           display_name = File.basename(model[:path], '.gguf')
+           puts " #{colorize(current_index.to_s, :green)}. " \
+                "#{display_name} #{colorize("(#{format_file_size(model[:size])})", :yellow)}"
+           all_choices << model[:path]
+           current_index += 1
+         end
+
+         puts "\n"
+       end
+
+       if all_choices.empty?
+         puts colorize('No models available', :yellow)
+         exit(1)
+       end
+
+       print colorize("Enter number (1-#{all_choices.length}): ", :cyan)
+
+       choice = $stdin.gets&.strip.to_i
+
+       if choice < 1 || choice > all_choices.length
+         puts colorize('Invalid choice', :red)
+
+         exit(1)
+       end
+
+       all_choices[choice - 1]
+     end
+
+     def find_downloaded_models
+       models_dir = File.join(Dir.home, '.rllama', 'models')
+
+       return [] unless Dir.exist?(models_dir)
+
+       Dir.glob(File.join(models_dir, '**', '*.gguf')).reject do |path|
+         basename = File.basename(path)
+
+         basename.start_with?('~', '!')
+       end
+     end
+
+     def format_file_size(bytes)
+       gb = bytes / (1024.0**3)
+
+       if gb >= 1.0
+         format('%.1fGB', gb)
+       else
+         mb = bytes / (1024.0**2)
+
+         format('%dMB', mb.round)
+       end
+     end
+
+     def chat_loop(context)
+       loop do
+         user_input = Readline.readline('> ', false)&.strip
+
+         break if user_input.nil?
+
+         next if user_input.empty?
+
+         if user_input.downcase == 'exit' || user_input.downcase == 'quit'
+           puts "\n#{colorize('Goodbye!', :yellow)}"
+
+           break
+         end
+
+         puts "\n"
+
+         print "#{colorize('Assistant:', :magenta, bold: true)} "
+
+         context.generate(user_input) do |token|
+           print token
+           $stdout.flush
+         end
+
+         puts "\n\n"
+       end
+     end
+
+     def colorize(text, color, bold: false)
+       return text unless $stdout.tty?
+
+       code = COLOR_CODES[color] || 37
+
+       prefix = bold ? "\e[1;#{code}m" : "\e[#{code}m"
+
+       "#{prefix}#{text}\e[0m"
+     end
+   end
+ end
@@ -0,0 +1,233 @@
+ # frozen_string_literal: true
+
+ module Rllama
+   class Context
+     attr_reader :messages, :n_ctx, :n_batch, :n_past
+
+     def initialize(model, embeddings: false, n_ctx: nil, n_batch: nil)
+       @model = model
+       @n_ctx = n_ctx
+       @n_batch = n_batch
+       @embeddings = embeddings
+
+       @ctx_params = Cpp.llama_context_default_params
+
+       @ctx_params[:n_ctx] = @n_ctx
+       @ctx_params[:n_batch] = @n_batch
+
+       if @embeddings
+         @ctx_params[:n_seq_max] = [@n_batch, @model.n_seq_max].min
+         @ctx_params[:embeddings] = true
+       end
+
+       @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)
+
+       raise Error, 'Failed to create the llama_context' if @pointer.null?
+
+       @n_past = 0
+       @messages = []
+     end
+
+     def generate(message, role: 'user', max_tokens: @n_ctx, temperature: 0.8, top_k: 40, top_p: 0.95, min_p: 0.05,
+                  seed: nil, system: nil)
+       @messages << { role: 'system', content: system } if system && @messages.empty?
+
+       if message.is_a?(Array)
+         @messages.push(*message)
+       elsif message.is_a?(Hash)
+         @messages.push(message)
+       else
+         @messages << { role: role, content: message }
+       end
+
+       prompt_string = @model.build_chat_template(@messages)
+
+       n_prompt_tokens = -Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, nil, 0, true, true)
+
+       raise Error, 'Prompt is too long.' if n_prompt_tokens.negative?
+
+       prompt_tokens_ptr = FFI::MemoryPointer.new(:int32, n_prompt_tokens)
+       tokens_written = Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, prompt_tokens_ptr,
+                                           n_prompt_tokens, true, true)
+
+       raise Error, 'Failed to tokenize prompt.' if tokens_written.negative?
+
+       new_token_count = tokens_written - @n_past
+
+       if new_token_count.positive?
+         new_tokens_ptr = prompt_tokens_ptr + (@n_past * FFI.type_size(:int32))
+
+         batch = Cpp.llama_batch_get_one(new_tokens_ptr, new_token_count)
+
+         raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0
+
+         @n_past = tokens_written
+       end
+
+       chain_params = Cpp.llama_sampler_chain_default_params
+       sampler_chain = Cpp.llama_sampler_chain_init(chain_params)
+
+       Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_min_p(min_p, 1)) if min_p
+       Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_k(top_k)) if top_k&.positive?
+       Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_p(top_p, 1)) if top_p && top_p < 1.0
+       if temperature&.positive?
+         Cpp.llama_sampler_chain_add(sampler_chain,
+                                     Cpp.llama_sampler_init_temp(temperature))
+       end
+
+       is_probabilistic = temperature&.positive? || top_k&.positive? || (top_p && top_p < 1.0) || !min_p.nil?
+       rng_seed = seed || (Random.new_seed & 0xFFFFFFFF)
+
+       if is_probabilistic
+         Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_dist(rng_seed))
+       else
+         Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_greedy)
+       end
+
+       n_decoded = 0
+
+       generated_text = ''.b
+
+       assistant_message = { role: 'assistant', content: generated_text }
+
+       @messages << assistant_message
+
+       start_time = Time.now
+
+       loop do
+         break if n_decoded >= max_tokens
+
+         new_token_id = Cpp.llama_sampler_sample(sampler_chain, @pointer, -1)
+
+         break if Cpp.llama_vocab_is_eog(@model.vocab, new_token_id)
+
+         buffer = FFI::MemoryPointer.new(:char, 256)
+         n_chars = Cpp.llama_token_to_piece(@model.vocab, new_token_id, buffer, buffer.size, 0, true)
+
+         if n_chars >= 0
+           piece_bytes = buffer.read_string(n_chars)
+           utf8_piece = piece_bytes.force_encoding(Encoding::UTF_8)
+           generated_text << utf8_piece
+           yield utf8_piece if block_given?
+         end
+
+         token_ptr = FFI::MemoryPointer.new(:int32, 1).put_int32(0, new_token_id)
+         batch = Cpp.llama_batch_get_one(token_ptr, 1)
+
+         raise Error, 'context length has been exceeded' if @n_past >= @n_ctx
+         raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0
+
+         @n_past += 1
+         n_decoded += 1
+       end
+
+       end_time = Time.now
+
+       duration = end_time - start_time
+
+       tps = n_decoded.positive? && duration.positive? ? n_decoded / duration : 0
+
+       Cpp.llama_sampler_free(sampler_chain)
+
+       Result.new(
+         text: generated_text,
+         stats: {
+           duration:,
+           tokens_generated: n_decoded,
+           tps:,
+           seed: rng_seed
+         }
+       )
+     end
+     alias message generate
+
+     def embed(strings, normalize: true, batch_size: 512)
+       is_array = strings.is_a?(Array)
+
+       strings = Array(strings) unless is_array
+
+       tokenized_strings = strings.map do |text|
+         max_tokens = text.bytesize + 2
+         tokens_ptr = FFI::MemoryPointer.new(:int32, max_tokens)
+         count = Cpp.llama_tokenize(@model.vocab, text, text.bytesize, tokens_ptr, max_tokens, true, false)
+
+         raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+
+         tokens_ptr.read_array_of_int32(count)
+       end
+
+       all_embeddings = []
+       batch = Cpp.llama_batch_init(batch_size, 0, 1)
+       prompts_in_batch = []
+       current_batch_token_count = 0
+
+       process_batch = lambda do
+         next if prompts_in_batch.empty?
+
+         batch[:n_tokens] = current_batch_token_count
+
+         raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?
+
+         prompts_in_batch.each do |seq_id_in_batch|
+           embd_ptr = Cpp.llama_get_embeddings_seq(@pointer, seq_id_in_batch)
+
+           raise Error, 'Failed to get embedding' if embd_ptr.null?
+
+           embedding = embd_ptr.read_array_of_float(@model.n_embd)
+
+           all_embeddings << (normalize ? normalize_embedding(embedding) : embedding)
+         end
+
+         current_batch_token_count = 0
+         prompts_in_batch = []
+       end
+
+       tokenized_strings.each do |tokens|
+         batch_full = (current_batch_token_count + tokens.size) > batch_size
+         seq_limit_reached = prompts_in_batch.size >= @model.n_seq_max
+         process_batch.call if !prompts_in_batch.empty? && (batch_full || seq_limit_reached)
+
+         seq_id = prompts_in_batch.size
+         prompts_in_batch << seq_id
+
+         tokens.each_with_index do |token_id, pos|
+           idx = current_batch_token_count
+
+           batch[:token].put_int32(idx * FFI.type_size(:int32), token_id)
+           batch[:pos].put_int32(idx * FFI.type_size(:int32), pos)
+           batch[:n_seq_id].put_int32(idx * FFI.type_size(:int32), 1)
+           batch[:seq_id].get_pointer(idx * FFI::Pointer.size).put_int32(0, seq_id)
+           batch[:logits].put_int8(idx, pos == tokens.size - 1 ? 1 : 0)
+
+           current_batch_token_count += 1
+         end
+       end
+
+       process_batch.call
+
+       Cpp.llama_batch_free(batch)
+
+       is_array ? all_embeddings : all_embeddings[0]
+     end
+
+     def embeddings?
+       @embeddings
+     end
+
+     def close
+       Cpp.llama_free(@pointer)
+     end
+
+     def norm(vec)
+       Math.sqrt(vec.sum { |x| x**2 })
+     end
+
+     def normalize_embedding(vec)
+       n = norm(vec)
+
+       return vec if n.zero?
+
+       vec.map { |x| x / n }
+     end
+   end
+ end