red-candle 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +460 -379
- data/README.md +1 -1
- data/ext/candle/Cargo.toml +3 -3
- data/ext/candle/src/llm/constrained_generation_test.rs +79 -0
- data/ext/candle/src/llm/gemma.rs +24 -9
- data/ext/candle/src/llm/llama.rs +46 -10
- data/ext/candle/src/llm/mistral.rs +46 -10
- data/ext/candle/src/llm/phi.rs +76 -8
- data/ext/candle/src/llm/qwen.rs +23 -10
- data/ext/candle/src/llm/text_generation.rs +40 -50
- data/ext/candle/src/ruby/llm.rs +62 -29
- data/ext/candle/src/ruby/structured.rs +54 -10
- data/lib/candle/llm.rb +77 -3
- data/lib/candle/version.rb +1 -1
- metadata +11 -13
- data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt +0 -1
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs +0 -355
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs +0 -276
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs +0 -49
- data/ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs +0 -2748
- data/ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs +0 -8902
data/ext/candle/src/llm/text_generation.rs
CHANGED

@@ -148,47 +148,28 @@ impl TextGeneration {
         if let (Some(ref constraint_index), Some(current_state)) = (&self.constraint, self.constraint_state) {
             // Get the next state
             let next_state = constraint_index.next_state(&current_state, &next_token);
-
+
             // Check if we're transitioning to a state with no allowed tokens (completion)
             if !self.constraint_completed && self.tokens.len() > self.tokens_since_constraint_start {
-                // Check if
-                // This happens when the pattern is complete and the FSM allows "anything"
-
-                let current_constrained = if let Some(allowed) = constraint_index.allowed_tokens(&current_state) {
-                    // Consider it constrained if we have a limited set of allowed tokens
-                    allowed.len() < 1000 // Arbitrary threshold for "constrained"
-                } else {
-                    true // No tokens allowed is definitely constrained
-                };
-
-                let next_constrained = if let Some(next_state_val) = next_state {
-                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
-                        allowed.is_empty() || allowed.len() < 1000
-                    } else {
-                        true
-                    }
-                } else {
-                    true
-                };
-
-                // If we're transitioning from constrained to unconstrained, we've completed the pattern
-                if current_constrained && !next_constrained {
-                    self.constraint_completed = true;
-                }
-
-                // Also check if next state has no allowed tokens at all
+                // Check if next state has no allowed tokens at all - this is definitive completion
                 if let Some(next_state_val) = next_state {
                     if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
                         if allowed.is_empty() {
                             self.constraint_completed = true;
                         }
+                        // Only mark as complete if ONLY EOS is allowed (not just if EOS is one of many options)
+                        else if let Some(eos) = self.eos_token_id {
+                            if allowed.len() == 1 && allowed.contains(&eos) {
+                                self.constraint_completed = true;
+                            }
+                        }
                     } else {
                         // None means no tokens allowed - constraint is complete
                         self.constraint_completed = true;
                     }
                 }
             }
-
+
             self.constraint_state = next_state;
         }
 
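The net effect of this hunk: the old "fewer than 1000 allowed tokens" heuristic is removed, and completion is now signaled only by definitive FSM states. A minimal Ruby sketch of the new decision rule (names are illustrative, not the gem's API; `allowed` is the token set the FSM permits from the next state, `eos` the EOS token ID, and `nil` stands in for the Rust `None` case):

    # Returns true only on the two definitive completion signals.
    def constraint_completed?(allowed, eos)
      return true if allowed.nil?    # no reachable tokens at all: pattern finished
      return true if allowed.empty?  # empty allowed set: definitive completion
      allowed.length == 1 && allowed.include?(eos)  # only EOS remains
    end

    constraint_completed?([42], 42)     # => true  (only EOS is allowed)
    constraint_completed?([42, 7], 42)  # => false (EOS is one of many options)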
@@ -201,22 +182,22 @@ impl TextGeneration {
         if self.constraint_completed {
             return true;
         }
-
+
         // Also check the current state
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
-            // Check if the constraint has reached a state where it
-            // This happens when
-            // 1. We have no more allowed tokens (constraint fully satisfied)
-            // 2. The EOS token is in the allowed tokens (optional ending)
+            // Check if the constraint has reached a state where it MUST end
+            // This happens when there are no more allowed tokens (constraint fully satisfied)
             if let Some(allowed) = constraint_index.allowed_tokens(&state) {
                 // If no tokens are allowed, the constraint is fully satisfied
                 if allowed.is_empty() {
                     return true;
                 }
-
-                //
+
+                // For JSON schemas, check if ONLY the EOS token is allowed
+                // This means we've generated a complete, valid JSON structure
+                // Don't treat EOS as a satisfaction signal if other tokens are also allowed
                 if let Some(eos) = self.eos_token_id {
-                    if allowed.contains(&eos) {
+                    if allowed.len() == 1 && allowed.contains(&eos) {
                         return true;
                     }
                 }
@@ -229,28 +210,37 @@ impl TextGeneration {
     }
 
     /// Check if the constraint is satisfied when stop_on_match is true
+    /// NOTE: For JSON schemas, this should only return true when the JSON structure is complete,
+    /// not just because we're in a state with many allowed tokens (like inside a string).
     pub fn is_constraint_satisfied_stop_on_match(&self) -> bool {
         // When stop_on_match is true, we stop as soon as the constraint is completed
         if self.constraint_completed {
             return true;
         }
-
-        //
-        //
-        //
+
+        // For JSON and other structured outputs, don't use the "large allowed set" heuristic.
+        // Instead, only consider the constraint satisfied when:
+        // 1. There are no allowed tokens (definitive completion)
+        // 2. Only EOS is allowed (completion with optional termination)
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
-
-
-            if
-
-
-
+            if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                // No more tokens allowed - definitely complete
+                if allowed.is_empty() {
+                    return true;
+                }
+
+                // Only EOS is allowed - complete JSON structure
+                if let Some(eos) = self.eos_token_id {
+                    if allowed.len() == 1 && allowed.contains(&eos) {
                         return true;
                     }
                 }
+            } else {
+                // None means no tokens allowed - constraint is complete
+                return true;
             }
         }
-
+
         false
     }
 
@@ -259,13 +249,13 @@ impl TextGeneration {
         if self.tokens.len() >= max_length {
             return true;
         }
-
+
         if let Some(eos) = self.eos_token_id {
             if token == eos {
                 return true;
             }
         }
-
+
         // Check if we've reached a final state in constraint
         // A state is considered final if it has no allowed tokens
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
@@ -278,7 +268,7 @@ impl TextGeneration {
                 return true;
             }
         }
-
+
         false
     }
 
data/ext/candle/src/ruby/llm.rs
CHANGED

@@ -257,14 +257,15 @@ impl LLM {
         let model_lower = model_id.to_lowercase();
         let is_quantized = model_lower.contains("gguf") || model_lower.contains("-q4") || model_lower.contains("-q5") || model_lower.contains("-q8");
 
+        // Extract tokenizer source if provided in model_id (for both GGUF and regular models)
+        let (model_id_clean, tokenizer_source) = if let Some(pos) = model_id.find("@@") {
+            let (id, _tok) = model_id.split_at(pos);
+            (id.to_string(), Some(&model_id[pos+2..]))
+        } else {
+            (model_id.clone(), None)
+        };
+
         let model = if is_quantized {
-            // Extract tokenizer source if provided in model_id
-            let (model_id_clean, tokenizer_source) = if let Some(pos) = model_id.find("@@") {
-                let (id, _tok) = model_id.split_at(pos);
-                (id.to_string(), Some(&model_id[pos+2..]))
-            } else {
-                (model_id.clone(), None)
-            };
 
             // Use unified GGUF loader for all quantized models
             let gguf_model = rt.block_on(async {
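Note the "@@" convention in model_id: everything before "@@" names the model repo, everything after names an optional tokenizer source. Hoisting the split above the is_quantized branch makes the override available to non-quantized models too. A quick Ruby illustration of the same split (the repo IDs are invented for the example):

    model_id = "TheBloke/Mistral-7B-v0.1-GGUF@@mistralai/Mistral-7B-v0.1"
    base, tokenizer = model_id.split("@@", 2)
    base       # => "TheBloke/Mistral-7B-v0.1-GGUF"
    tokenizer  # => "mistralai/Mistral-7B-v0.1" (nil when no "@@" is present)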
@@ -273,41 +274,73 @@ impl LLM {
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load GGUF model: {}", e)))?;
             ModelType::QuantizedGGUF(gguf_model)
         } else {
-            // Load non-quantized models
-
-
-
-
+            // Load non-quantized models based on type
+            let model_lower_clean = model_id_clean.to_lowercase();
+
+            if model_lower_clean.contains("mistral") {
+                let mistral = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustMistral::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustMistral::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Mistral(mistral)
-            } else if
-                let llama =
-
-
+            } else if model_lower_clean.contains("llama") || model_lower_clean.contains("meta-llama") || model_lower_clean.contains("tinyllama") {
+                let llama = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustLlama::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustLlama::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Llama(llama)
-            } else if
-                let gemma =
-
-
+            } else if model_lower_clean.contains("gemma") || model_lower_clean.contains("google/gemma") {
+                let gemma = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustGemma::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustGemma::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Gemma(gemma)
-            } else if
-                let qwen =
-
-
+            } else if model_lower_clean.contains("qwen") {
+                let qwen = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustQwen::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustQwen::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                ModelType::Qwen(qwen)
-            } else if
-                let phi =
-
-
+            } else if model_lower_clean.contains("phi") {
+                let phi = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustPhi::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustPhi::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                ModelType::Phi(phi)
            } else {
                return Err(Error::new(
                    magnus::exception::runtime_error(),
-                    format!("Unsupported model type: {}. Currently Mistral, Llama, Gemma, Qwen, and Phi models are supported.",
+                    format!("Unsupported model type: {}. Currently Mistral, Llama, Gemma, Qwen, and Phi models are supported.", model_id_clean),
                ));
            }
        };
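The loader above dispatches on substrings of the lowercased, cleaned model ID, in the order shown. An illustrative Ruby sketch of that matching order (not the gem's API; the model ID is an example):

    family = case "TinyLlama/TinyLlama-1.1B-Chat-v1.0".downcase
             when /mistral/                    then :mistral
             when /llama|meta-llama|tinyllama/ then :llama
             when /gemma/                      then :gemma
             when /qwen/                       then :qwen
             when /phi/                        then :phi
             else :unsupported
             end
    family # => :llama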
data/ext/candle/src/ruby/structured.rs
CHANGED

@@ -1,7 +1,7 @@
 use magnus::{Error, Module, RModule, function, Object};
 use std::sync::Arc;
 
-use crate::structured::{SchemaProcessor, VocabularyAdapter, Index};
+use crate::structured::{SchemaProcessor, VocabularyAdapter, Index, Vocabulary};
 use crate::ruby::{Result, tokenizer::Tokenizer};
 
 /// Ruby wrapper for structured generation constraints

@@ -12,36 +12,80 @@ pub struct StructuredConstraint {
 }
 
 impl StructuredConstraint {
-    /// Create a constraint from a JSON schema
+    /// Create a constraint from a JSON schema using a model ID
+    /// This uses Vocabulary::from_pretrained which handles tokenizer byte encoding correctly
+    pub fn from_schema_with_model(schema: String, model_id: String) -> Result<Self> {
+        // Use tokio runtime for async vocabulary loading
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create runtime: {}", e)))?;
+
+        let vocabulary = rt.block_on(async {
+            Vocabulary::from_pretrained(&model_id, None)
+        })
+        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary from model '{}': {:?}", model_id, e)))?;
+
+        let processor = SchemaProcessor::new();
+        let index = processor.process_schema(&schema, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process schema: {}", e)))?;
+
+        Ok(Self { index })
+    }
+
+    /// Create a constraint from a regex pattern using a model ID
+    pub fn from_regex_with_model(pattern: String, model_id: String) -> Result<Self> {
+        // Use tokio runtime for async vocabulary loading
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create runtime: {}", e)))?;
+
+        let vocabulary = rt.block_on(async {
+            Vocabulary::from_pretrained(&model_id, None)
+        })
+        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary from model '{}': {:?}", model_id, e)))?;
+
+        let processor = SchemaProcessor::new();
+        let index = processor.process_regex(&pattern, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process regex: {}", e)))?;
+
+        Ok(Self { index })
+    }
+
+    /// Create a constraint from a JSON schema (legacy method using tokenizer directly)
+    /// Note: This may not handle all tokenizer byte encodings correctly
     pub fn from_schema(schema: String, tokenizer: &Tokenizer) -> Result<Self> {
         let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
-
+
         let processor = SchemaProcessor::new();
         let index = processor.process_schema(&schema, &vocabulary)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process schema: {}", e)))?;
-
+
         Ok(Self { index })
     }
-
-    /// Create a constraint from a regex pattern
+
+    /// Create a constraint from a regex pattern (legacy method using tokenizer directly)
+    /// Note: This may not handle all tokenizer byte encodings correctly
     pub fn from_regex(pattern: String, tokenizer: &Tokenizer) -> Result<Self> {
         let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
-
+
         let processor = SchemaProcessor::new();
         let index = processor.process_regex(&pattern, &vocabulary)
            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process regex: {}", e)))?;
-
+
        Ok(Self { index })
    }
 }
 
 pub fn init_structured(rb_candle: RModule) -> Result<()> {
     let class = rb_candle.define_class("StructuredConstraint", magnus::class::object())?;
-
+
+    // New methods using model_id for proper vocabulary loading
+    class.define_singleton_method("from_schema_with_model", function!(StructuredConstraint::from_schema_with_model, 2))?;
+    class.define_singleton_method("from_regex_with_model", function!(StructuredConstraint::from_regex_with_model, 2))?;
+
+    // Legacy methods using tokenizer directly (may have byte encoding issues with some models)
     class.define_singleton_method("from_schema", function!(StructuredConstraint::from_schema, 2))?;
     class.define_singleton_method("from_regex", function!(StructuredConstraint::from_regex, 2))?;
-
+
     Ok(())
 }
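A usage sketch for the new binding, assuming the class is exposed as Candle::StructuredConstraint under the Candle module (the schema and model ID are illustrative):

    require "json"

    schema = JSON.generate({
      type: "object",
      properties: { name: { type: "string" } },
      required: ["name"]
    })
    # Builds the FSM index from the model's own vocabulary:
    constraint = Candle::StructuredConstraint.from_schema_with_model(
      schema, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )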
data/lib/candle/llm.rb
CHANGED

@@ -32,16 +32,90 @@ module Candle
       end
     end
 
     # Create a structured constraint from a JSON schema
+    # Uses the model's vocabulary with proper byte encoding handling
     def constraint_from_schema(schema)
       schema_str = schema.is_a?(String) ? schema : JSON.generate(schema)
-
+
+      # Extract the tokenizer source model ID for proper vocabulary loading
+      tokenizer_model = tokenizer_source_model
+      if tokenizer_model
+        begin
+          StructuredConstraint.from_schema_with_model(schema_str, tokenizer_model)
+        rescue RuntimeError => e
+          # Fall back to legacy method if from_pretrained fails
+          # (e.g., tokenizer doesn't have EOS token in expected format)
+          if e.message.include?("UnsupportedTokenizer")
+            StructuredConstraint.from_schema(schema_str, tokenizer)
+          else
+            raise
+          end
+        end
+      else
+        # Fall back to legacy method if we can't determine the model
+        StructuredConstraint.from_schema(schema_str, tokenizer)
+      end
     end
-
+
     # Create a structured constraint from a regex pattern
+    # Uses the model's vocabulary with proper byte encoding handling
     def constraint_from_regex(pattern)
       pattern_str = pattern.is_a?(Regexp) ? pattern.source : pattern.to_s
-
+
+      # Extract the tokenizer source model ID for proper vocabulary loading
+      tokenizer_model = tokenizer_source_model
+      if tokenizer_model
+        begin
+          StructuredConstraint.from_regex_with_model(pattern_str, tokenizer_model)
+        rescue RuntimeError => e
+          # Fall back to legacy method if from_pretrained fails
+          if e.message.include?("UnsupportedTokenizer")
+            StructuredConstraint.from_regex(pattern_str, tokenizer)
+          else
+            raise
+          end
+        end
+      else
+        # Fall back to legacy method if we can't determine the model
+        StructuredConstraint.from_regex(pattern_str, tokenizer)
+      end
     end
+
+    private
+
+    # Get the model ID to use for vocabulary loading
+    # This handles GGUF models by extracting the tokenizer source
+    def tokenizer_source_model
+      opts = options rescue {}
+
+      # For GGUF models, use the tokenizer source if available
+      if opts["tokenizer_source"]
+        return opts["tokenizer_source"]
+      end
+
+      # For regular models, use the base model ID
+      if opts["base_model"]
+        return opts["base_model"]
+      end
+
+      # Try model_id but strip GGUF parts
+      model = opts["model_id"] || (model_id rescue nil)
+      return nil unless model
+
+      # Remove GGUF file suffix if present
+      if model.include?("@")
+        model = model.split("@").first
+      end
+
+      # For GGUF repos, try to guess the tokenizer source
+      if model.downcase.include?("gguf")
+        guessed = self.class.guess_tokenizer(model)
+        return guessed if guessed && guessed != model
+      end
+
+      model
+    end
+
+    public
 
     # Generate with regex constraint
     def generate_regex(prompt, pattern:, stop_on_match: true, **options)
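An end-to-end sketch of the Ruby-side fallback path (the model ID is illustrative, and from_pretrained is assumed to be the gem's existing loader; generate_regex and constraint_from_schema are the methods shown in this diff):

    llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Tries the vocabulary-based constraint first, then silently falls back
    # to the tokenizer-based one on an UnsupportedTokenizer error:
    constraint = llm.constraint_from_schema({ type: "string" })

    # Regex-constrained generation; stops once the pattern is complete:
    llm.generate_regex("Call me at ", pattern: '\d{3}-\d{4}', stop_on_match: true)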
data/lib/candle/version.rb
CHANGED
metadata
CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-candle
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.3.1
 platform: ruby
 authors:
 - Christopher Petersen

@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-12-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys

@@ -151,7 +151,9 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.13'
-description:
+description: Ruby gem for running state-of-the-art language models locally. Access
+  LLMs, embeddings, rerankers, and NER models directly from Ruby using Rust-powered
+  Candle with Metal/CUDA acceleration.
 email:
 - chris@petersen.io
 - 2xijok@gmail.com

@@ -204,12 +206,6 @@ files:
 - ext/candle/src/structured/vocabulary_adapter_simple_test.rs
 - ext/candle/src/tokenizer/loader.rs
 - ext/candle/src/tokenizer/mod.rs
-- ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs
-- ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs
-- ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs
 - ext/candle/tests/device_tests.rs
 - ext/candle/tests/tensor_tests.rs
 - lib/candle.rb

@@ -237,16 +233,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.
+      version: 3.1.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.3
+      version: '3.3'
 requirements:
 - Rust >= 1.85
-rubygems_version: 3.
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
-summary:
+summary: Ruby gem for running state-of-the-art language models locally. Access LLMs,
+  embeddings, rerankers, and NER models directly from Ruby using Rust-powered Candle
+  with Metal/CUDA acceleration.
 test_files: []
data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt
DELETED

@@ -1 +0,0 @@
-aarch64-apple-darwin