ruby_llm-red_candle 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +26 -0
- data/LICENSE.txt +21 -0
- data/README.md +378 -0
- data/Rakefile +10 -0
- data/examples/smoke_test.rb +320 -0
- data/lib/ruby_llm/red_candle/capabilities.rb +112 -0
- data/lib/ruby_llm/red_candle/chat.rb +445 -0
- data/lib/ruby_llm/red_candle/configuration.rb +38 -0
- data/lib/ruby_llm/red_candle/models.rb +120 -0
- data/lib/ruby_llm/red_candle/provider.rb +92 -0
- data/lib/ruby_llm/red_candle/schema_validator.rb +102 -0
- data/lib/ruby_llm/red_candle/streaming.rb +38 -0
- data/lib/ruby_llm/red_candle/version.rb +7 -0
- data/lib/ruby_llm-red_candle.rb +32 -0
- metadata +172 -0
data/lib/ruby_llm/red_candle/chat.rb
@@ -0,0 +1,445 @@
# frozen_string_literal: true

module RubyLLM
  module RedCandle
    # Chat implementation for Red Candle provider
    module Chat
      # Override the base complete method to handle local execution
      def complete(messages, tools:, temperature:, model:, params: {}, headers: {}, schema: nil, &block)
        _ = headers # Interface compatibility
        payload = RubyLLM::Utils.deep_merge(
          render_payload(
            messages,
            tools: tools,
            temperature: temperature,
            model: model,
            stream: block_given?,
            schema: schema
          ),
          params
        )

        if block_given?
          perform_streaming_completion!(payload, &block)
        else
          result = perform_completion!(payload)
          # Convert to Message object for compatibility
          # Red Candle doesn't provide token counts by default, but we can estimate them
          content = result[:content]
          # Rough estimation: ~4 characters per token
          estimated_output_tokens = (content.length / 4.0).round
          estimated_input_tokens = estimate_input_tokens(payload[:messages])

          RubyLLM::Message.new(
            role: result[:role].to_sym,
            content: content,
            model_id: model.id,
            input_tokens: estimated_input_tokens,
            output_tokens: estimated_output_tokens
          )
        end
      end

      def render_payload(messages, tools:, temperature:, model:, stream:, schema:)
        # Red Candle doesn't support tools
        if tools && !tools.empty?
          raise RubyLLM::Error.new(nil, "Red Candle provider does not support tool calling")
        end

        {
          messages: messages,
          temperature: temperature,
          model: model.id,
          stream: stream,
          schema: schema
        }
      end

      def perform_completion!(payload)
        model = ensure_model_loaded!(payload[:model])
        messages = format_messages(payload[:messages])

        # Handle structured generation differently - we need to build the prompt
        # with JSON instructions BEFORE applying the chat template
        response = if payload[:schema]
                     generate_with_schema(model, messages, payload[:schema], payload)
                   else
                     prompt = build_prompt(model, messages)
                     validate_context_length!(prompt, payload[:model])
                     config = build_generation_config(payload)
                     generate_with_error_handling(model, prompt, config, payload[:model])
                   end

        format_response(response, payload[:schema])
      end

      def perform_streaming_completion!(payload, &block)
        model = ensure_model_loaded!(payload[:model])
        messages = format_messages(payload[:messages])

        prompt = build_prompt(model, messages)
        validate_context_length!(prompt, payload[:model])
        config = build_generation_config(payload)

        # Collect all streamed content
        full_content = ""

        # Stream tokens with error handling
        stream_with_error_handling(model, prompt, config, payload[:model]) do |token|
          full_content += token
          chunk = format_stream_chunk(token)
          block.call(chunk)
        end

        # Send final chunk with empty content (indicates completion)
        final_chunk = format_stream_chunk("")
        block.call(final_chunk)

        # Return a Message object with the complete response
        estimated_output_tokens = (full_content.length / 4.0).round
        estimated_input_tokens = estimate_input_tokens(payload[:messages])

        RubyLLM::Message.new(
          role: :assistant,
          content: full_content,
          model_id: payload[:model],
          input_tokens: estimated_input_tokens,
          output_tokens: estimated_output_tokens
        )
      end

      private

      # Build the prompt string from messages using the model's chat template
      def build_prompt(model, messages)
        if model.respond_to?(:apply_chat_template)
          model.apply_chat_template(messages)
        else
          # Fallback to simple formatting
          "#{messages.map { |m| "#{m[:role]}: #{m[:content]}" }.join("\n\n")}\n\nassistant:"
        end
      end

      # Get generation parameters with consistent defaults
      # @param payload [Hash] The request payload
      # @param structured [Boolean] Whether this is for structured generation (uses different defaults)
      # @return [Array<Float, Integer>] temperature and max_length values
      def generation_params(payload, structured: false)
        temperature = payload[:temperature] || (structured ? 0.3 : 0.7)
        max_length = payload[:max_tokens] || (structured ? 1024 : 512)
        [temperature, max_length]
      end

      # Build generation config with consistent defaults
      # @param payload [Hash] The request payload
      # @param structured [Boolean] Whether this is for structured generation (uses different defaults)
      def build_generation_config(payload, structured: false)
        temperature, max_length = generation_params(payload, structured: structured)
        ::Candle::GenerationConfig.balanced(
          temperature: temperature,
          max_length: max_length
        )
      end

      def ensure_model_loaded!(model_id)
        @loaded_models[model_id] ||= load_model(model_id)
      end

      def model_options(model_id)
        # Get GGUF file and tokenizer if this is a GGUF model
        # Access the methods from the Models module which is included in the provider
        options = { device: @device }
        options[:gguf_file] = gguf_file_for(model_id) if respond_to?(:gguf_file_for)
        options[:tokenizer] = tokenizer_for(model_id) if respond_to?(:tokenizer_for)
        options
      end

      def load_model(model_id)
        options = model_options(model_id)
        ::Candle::LLM.from_pretrained(model_id, **options)
      rescue StandardError => e
        if e.message.include?("Failed to find tokenizer")
          raise RubyLLM::Error.new(nil, token_error_message(e, options[:tokenizer]))
        elsif e.message.include?("Failed to find model")
          raise RubyLLM::Error.new(nil, model_error_message(e, model_id))
        else
          raise RubyLLM::Error.new(nil, "Failed to load model #{model_id}: #{e.message}")
        end
      end

      def token_error_message(exception, tokenizer)
        <<~ERROR_MESSAGE
          Failed to load tokenizer '#{tokenizer}'. The tokenizer may not exist or require authentication.
          Please verify the tokenizer exists at: https://huggingface.co/#{tokenizer}
          And that you have accepted the terms of service for the tokenizer.
          If it requires authentication, login with: huggingface-cli login
          See https://github.com/scientist-labs/red-candle?tab=readme-ov-file#%EF%B8%8F-huggingface-login-warning
          Original error: #{exception.message}
        ERROR_MESSAGE
      end

      def model_error_message(exception, model_id)
        <<~ERROR_MESSAGE
          Failed to load model #{model_id}: #{exception.message}
          Please verify the model exists at: https://huggingface.co/#{model_id}
          And that you have accepted the terms of service for the model.
          If it requires authentication, login with: huggingface-cli login
          See https://github.com/scientist-labs/red-candle?tab=readme-ov-file#%EF%B8%8F-huggingface-login-warning
          Original error: #{exception.message}
        ERROR_MESSAGE
      end

      def generate_with_error_handling(model, prompt, config, model_id)
        model.generate(prompt, config: config)
      rescue StandardError => e
        raise RubyLLM::Error.new(nil, generation_error_message(e, model_id))
      end

      def stream_with_error_handling(model, prompt, config, model_id, &block)
        model.generate_stream(prompt, config: config, &block)
      rescue StandardError => e
        raise RubyLLM::Error.new(nil, generation_error_message(e, model_id))
      end

      def generation_error_message(exception, model_id)
        message = exception.message.to_s

        if message.include?("out of memory") || message.include?("OOM")
          <<~ERROR_MESSAGE.strip
            Out of memory while generating with #{model_id}.
            Try using a smaller model or reducing the context length.
            Original error: #{message}
          ERROR_MESSAGE
        elsif message.include?("context") || message.include?("sequence")
          <<~ERROR_MESSAGE.strip
            Context length exceeded for #{model_id}.
            The input is too long for this model's context window.
            Original error: #{message}
          ERROR_MESSAGE
        elsif message.include?("tensor") || message.include?("shape")
          <<~ERROR_MESSAGE.strip
            Model execution error for #{model_id}.
            This may indicate an incompatible model format or corrupted weights.
            Original error: #{message}
          ERROR_MESSAGE
        else
          "Generation failed for #{model_id}: #{message}"
        end
      end

      def format_messages(messages)
        messages.map do |msg|
          # Handle both hash and Message objects
          if msg.is_a?(RubyLLM::Message)
            {
              role: msg.role.to_s,
              content: extract_message_content_from_object(msg)
            }
          else
            {
              role: msg[:role].to_s,
              content: extract_message_content(msg)
            }
          end
        end
      end

      def extract_message_content_from_object(message)
        content = message.content

        # Handle Content objects
        if content.is_a?(RubyLLM::Content)
          # Extract text from Content object, including attachment text
          handle_content_object(content)
        elsif content.is_a?(String)
          content
        else
          content.to_s
        end
      end

      def extract_message_content(message)
        content = message[:content]

        # Handle Content objects
        case content
        when RubyLLM::Content
          # Extract text from Content object
          handle_content_object(content)
        when String
          content
        when Array
          # Handle array content (e.g., with images)
          content.filter_map { |part| part[:text] if part[:type] == "text" }.join(" ")
        else
          content.to_s
        end
      end

      def handle_content_object(content)
        text_parts = []
        text_parts << content.text if content.text

        # Add any text from attachments
        content.attachments&.each do |attachment|
          text_parts << attachment.data if attachment.respond_to?(:data) && attachment.data.is_a?(String)
        end

        text_parts.join(" ")
      end

      def generate_with_schema(model, messages, schema, payload)
        # Use Red Candle's native structured generation which uses the Rust outlines crate
        # for grammar-constrained generation. This ensures valid JSON output.

        # Normalize schema to ensure consistent symbol keys
        normalized_schema = deep_symbolize_keys(schema)

        # Validate schema before attempting generation
        SchemaValidator.validate!(normalized_schema)

        # Debug logging to help diagnose issues
        RubyLLM.logger.debug "=== STRUCTURED GENERATION DEBUG ==="
        RubyLLM.logger.debug "Original schema: #{schema.inspect}"
        RubyLLM.logger.debug "Normalized schema: #{normalized_schema.inspect}"
        RubyLLM.logger.debug "Messages: #{messages.inspect}"

        # For structured generation, we modify the last user message to include
        # JSON output instructions, then apply the chat template
        structured_messages = build_structured_messages(messages, normalized_schema)
        RubyLLM.logger.debug "Structured messages: #{structured_messages.inspect}"

        prompt = build_prompt(model, structured_messages)
        RubyLLM.logger.debug "Final prompt:\n#{prompt}"
        RubyLLM.logger.debug "=== END DEBUG ==="

        validate_context_length!(prompt, payload[:model])

        # Get generation parameters (structured generation uses different defaults)
        temperature, max_length = generation_params(payload, structured: true)

        result = model.generate_structured(
          prompt,
          schema: normalized_schema,
          temperature: temperature,
          max_length: max_length,
          warn_on_parse_error: true,
          reset_cache: true
        )

        RubyLLM.logger.debug "Structured generation result: #{result.inspect}"

        # generate_structured returns a Hash on success, or raw String on parse failure
        result
      rescue StandardError => e
        # Log at debug level - the raised exception will inform the caller
        RubyLLM.logger.debug "Structured generation failed: #{e.class}: #{e.message}"
        RubyLLM.logger.debug e.backtrace.first(5).join("\n") if e.backtrace
        raise RubyLLM::Error.new(nil, "Structured generation failed: #{e.message}")
      end

      # Recursively convert all hash keys to symbols
      def deep_symbolize_keys(obj)
        case obj
        when Hash
          obj.each_with_object({}) do |(key, value), result|
            result[key.to_sym] = deep_symbolize_keys(value)
          end
        when Array
          obj.map { |item| deep_symbolize_keys(item) }
        else
          obj
        end
      end

      def build_structured_messages(messages, schema)
        # Clone messages to avoid modifying the original
        modified_messages = messages.map(&:dup)

        # Find the last user message and append JSON instructions
        last_user_idx = modified_messages.rindex { |m| m[:role] == "user" }
        return modified_messages unless last_user_idx

        schema_description = describe_schema(schema)
        json_instruction = Configuration.build_json_instruction(schema_description)

        modified_messages[last_user_idx][:content] += json_instruction
        modified_messages
      end

      def describe_schema(schema)
        return "the requested data" unless schema.is_a?(Hash)

        # Support both symbol and string keys for robustness
        properties = schema[:properties] || schema["properties"]
        return "the requested data" unless properties

        properties.map do |key, value|
          type = value[:type] || value["type"] || "any"
          enum = value[:enum] || value["enum"]
          if enum
            "#{key} (#{type}, one of: #{enum.join(', ')})"
          else
            "#{key} (#{type})"
          end
        end.join(", ")
      end

      def format_response(response, schema)
        content = if schema && !response.is_a?(String)
                    # Structured response
                    JSON.generate(response)
                  else
                    response
                  end

        {
          content: content,
          role: "assistant"
        }
      end

      def format_stream_chunk(token)
        # Return a Chunk object for streaming compatibility
        RubyLLM::Chunk.new(
          role: :assistant,
          content: token
        )
      end

      def estimate_input_tokens(messages)
        # Rough estimation: ~4 characters per token
        formatted = format_messages(messages)
        total_chars = formatted.sum { |msg| "#{msg[:role]}: #{msg[:content]}".length }
        (total_chars / 4.0).round
      end

      def validate_context_length!(prompt, model_id)
        # Get the context window for this model
        context_window = if respond_to?(:model_context_window)
                           model_context_window(model_id)
                         else
                           4096 # Conservative default
                         end

        # Estimate tokens in prompt (~4 characters per token)
        estimated_tokens = (prompt.length / 4.0).round

        # Check if prompt exceeds context window (leave some room for response)
        max_input_tokens = context_window - 512 # Reserve 512 tokens for response
        return unless estimated_tokens > max_input_tokens

        raise RubyLLM::Error.new(
          nil,
          "Context length exceeded. Estimated #{estimated_tokens} tokens, " \
          "but model #{model_id} has a context window of #{context_window} tokens."
        )
      end

      # Delegate to Capabilities module for context window lookup
      def model_context_window(model_id)
        Capabilities.model_context_window(model_id)
      end
    end
  end
end
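
For reference, a minimal standalone sketch of the heuristics used above: the fallback prompt layout from build_prompt (used when a model exposes no chat template) and the ~4 characters-per-token estimate from estimate_input_tokens. The sample messages are hypothetical and only illustrate the shape of the data.

# Illustrative sketch only; mirrors Chat#build_prompt's fallback format and the
# rough token estimate used for context-length checks.
messages = [
  { role: "system", content: "You are a helpful assistant." },
  { role: "user", content: "Summarize Red Candle in one sentence." }
]

# Fallback prompt format when the model has no chat template:
prompt = "#{messages.map { |m| "#{m[:role]}: #{m[:content]}" }.join("\n\n")}\n\nassistant:"

# Rough token estimate (~4 characters per token):
estimated_tokens = (prompt.length / 4.0).round
puts prompt
puts "estimated tokens: #{estimated_tokens}"
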
data/lib/ruby_llm/red_candle/configuration.rb
@@ -0,0 +1,38 @@
# frozen_string_literal: true

module RubyLLM
  module RedCandle
    # Configuration options for Red Candle provider
    module Configuration
      # Default JSON instruction template for structured generation
      # Use {schema_description} as a placeholder for the schema description
      DEFAULT_JSON_INSTRUCTION = "\n\nRespond with ONLY a valid JSON object containing: {schema_description}"

      class << self
        # Get the JSON instruction template
        # @return [String] the template with {schema_description} placeholder
        def json_instruction_template
          @json_instruction_template || DEFAULT_JSON_INSTRUCTION
        end

        # Set a custom JSON instruction template
        # @param template [String] the template with {schema_description} placeholder
        def json_instruction_template=(template)
          @json_instruction_template = template
        end

        # Reset configuration to defaults
        def reset!
          @json_instruction_template = nil
        end

        # Build the JSON instruction by substituting the schema description
        # @param schema_description [String] the human-readable schema description
        # @return [String] the formatted instruction
        def build_json_instruction(schema_description)
          json_instruction_template.gsub("{schema_description}", schema_description)
        end
      end
    end
  end
end
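
A short usage sketch of the Configuration module above. It assumes the gem's entry file (data/lib/ruby_llm-red_candle.rb from the file list) is loaded via require "ruby_llm-red_candle"; the custom template and schema description strings are hypothetical examples.

# Assumed entry point; the exact require path comes from the file list above.
require "ruby_llm-red_candle"

# Customize the instruction appended to the last user message during
# structured generation; {schema_description} is substituted at call time.
RubyLLM::RedCandle::Configuration.json_instruction_template =
  "\n\nReturn strictly valid JSON with these fields: {schema_description}"

instruction = RubyLLM::RedCandle::Configuration.build_json_instruction(
  "name (string), age (integer)"
)
puts instruction

# Restore the default template afterwards.
RubyLLM::RedCandle::Configuration.reset!
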
data/lib/ruby_llm/red_candle/models.rb
@@ -0,0 +1,120 @@
# frozen_string_literal: true

module RubyLLM
  module RedCandle
    # Model registry and lookup methods for the Red Candle provider
    module Models
      # TODO: red-candle supports more models, but let's start with some well-tested ones.
      SUPPORTED_MODELS = [
        {
          id: "google/gemma-3-4b-it-qat-q4_0-gguf",
          name: "Gemma 3 4B Instruct (Quantized)",
          gguf_file: "gemma-3-4b-it-q4_0.gguf",
          tokenizer: "google/gemma-3-4b-it", # Tokenizer from base model
          context_window: 8192,
          family: "gemma",
          architecture: "gemma2",
          supports_chat: true,
          supports_structured: true
        },
        {
          id: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
          name: "TinyLlama 1.1B Chat (Quantized)",
          gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
          context_window: 2048,
          family: "llama",
          architecture: "llama",
          supports_chat: true,
          supports_structured: true
        },
        {
          id: "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
          name: "Mistral 7B Instruct v0.2 (Quantized)",
          gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf",
          tokenizer: "mistralai/Mistral-7B-Instruct-v0.2",
          context_window: 32_768,
          family: "mistral",
          architecture: "mistral",
          supports_chat: true,
          supports_structured: true
        },
        {
          id: "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
          name: "Qwen 2.5 1.5B Instruct (Quantized)",
          gguf_file: "qwen2.5-1.5b-instruct-q4_k_m.gguf",
          context_window: 32_768,
          family: "qwen2",
          architecture: "qwen2",
          supports_chat: true,
          supports_structured: true
        },
        {
          id: "microsoft/Phi-3-mini-4k-instruct",
          name: "Phi 3 Mini 4K Instruct",
          context_window: 4096,
          family: "phi",
          architecture: "phi",
          supports_chat: true,
          supports_structured: true
        }
      ].freeze

      def list_models
        SUPPORTED_MODELS.map do |model_data|
          RubyLLM::Model::Info.new(
            id: model_data[:id],
            name: model_data[:name],
            provider: slug,
            family: model_data[:family],
            context_window: model_data[:context_window],
            capabilities: %w[streaming structured_output],
            modalities: { input: %w[text], output: %w[text] }
          )
        end
      end

      def models
        @models ||= list_models
      end

      def model(id)
        models.find { |m| m.id == id } ||
          raise(RubyLLM::Error.new(nil,
                                   "Model #{id} not found in Red Candle provider. " \
                                   "Available models: #{model_ids.join(', ')}"))
      end

      def model_available?(id)
        SUPPORTED_MODELS.any? { |m| m[:id] == id }
      end

      def model_ids
        SUPPORTED_MODELS.map { |m| m[:id] }
      end

      def model_info(id)
        SUPPORTED_MODELS.find { |m| m[:id] == id }
      end

      def supports_chat?(model_id)
        info = model_info(model_id)
        info ? info[:supports_chat] : false
      end

      def supports_structured?(model_id)
        info = model_info(model_id)
        info ? info[:supports_structured] : false
      end

      def gguf_file_for(model_id)
        info = model_info(model_id)
        info ? info[:gguf_file] : nil
      end

      def tokenizer_for(model_id)
        info = model_info(model_id)
        info ? info[:tokenizer] : nil
      end
    end
  end
end
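
As an illustration of the lookup helpers above, a minimal harness sketch. The Lookup wrapper class is hypothetical; in the gem these methods are mixed into the provider, but the pure lookups only read SUPPORTED_MODELS, so they can be exercised in isolation once the gem is loaded.

# Hypothetical harness class for illustration only.
class Lookup
  include RubyLLM::RedCandle::Models
end

lookup = Lookup.new
id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

puts lookup.model_ids.inspect        # all registered model ids
puts lookup.model_available?(id)     # => true
puts lookup.gguf_file_for(id)        # => "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
puts lookup.tokenizer_for(id)        # => "mistralai/Mistral-7B-Instruct-v0.2"
puts lookup.supports_structured?(id) # => true
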
data/lib/ruby_llm/red_candle/provider.rb
@@ -0,0 +1,92 @@
# frozen_string_literal: true

require "candle"

module RubyLLM
  module RedCandle
    # Red Candle provider for local LLM execution using the Candle Rust crate.
    class Provider < RubyLLM::Provider
      include Chat
      include Models
      include Capabilities
      include Streaming

      def initialize(config)
        ensure_red_candle_available!
        super
        @loaded_models = {} # Cache for loaded models
        @device = determine_device(config)
      end

      def api_base
        nil # Local execution, no API base needed
      end

      def headers
        {} # No HTTP headers needed
      end

      class << self
        def capabilities
          Capabilities
        end

        def configuration_requirements
          [] # No required config, device is optional
        end

        def local?
          true
        end

        def supports_functions?(model_id = nil)
          Capabilities.supports_functions?(model_id)
        end

        def models
          # Return Red Candle models for registration
          Models::SUPPORTED_MODELS.map do |model_data|
            RubyLLM::Model::Info.new(
              id: model_data[:id],
              name: model_data[:name],
              provider: "red_candle",
              type: "chat",
              family: model_data[:family],
              context_window: model_data[:context_window],
              capabilities: %w[streaming structured_output],
              modalities: { input: %w[text], output: %w[text] }
            )
          end
        end
      end

      private

      def ensure_red_candle_available!
        require "candle"
      rescue LoadError
        raise RubyLLM::Error.new(nil, "Red Candle gem is not installed. Add 'gem \"red-candle\"' to your Gemfile.")
      end

      def determine_device(config)
        if config.respond_to?(:red_candle_device) && config.red_candle_device
          case config.red_candle_device.to_s.downcase
          when "cpu"
            ::Candle::Device.cpu
          when "cuda", "gpu"
            ::Candle::Device.cuda
          when "metal"
            ::Candle::Device.metal
          else
            ::Candle::Device.best
          end
        else
          ::Candle::Device.best
        end
      rescue StandardError => e
        RubyLLM.logger.warn "Failed to initialize device: #{e.message}. Falling back to CPU."
        ::Candle::Device.cpu
      end
    end
  end
end
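
Finally, a rough end-to-end usage sketch. It assumes the entry file data/lib/ruby_llm-red_candle.rb (not shown in these hunks) registers the provider with RubyLLM under the :red_candle key and exposes the red_candle_device setting read by Provider#determine_device; the RubyLLM.configure / RubyLLM.chat calls below follow RubyLLM's public API, and the model id comes from SUPPORTED_MODELS above.

# Hedged sketch; provider registration and the red_candle_device setting are
# assumptions about code outside these hunks.
require "ruby_llm-red_candle"

RubyLLM.configure do |config|
  config.red_candle_device = "metal" # or "cpu" / "cuda"; omit to auto-select
end

chat = RubyLLM.chat(
  model: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
  provider: :red_candle
)
response = chat.ask("Say hello from a locally loaded model.")
puts response.content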