RubyGems - red-candle - Versions diffs - 1.0.2 → 1.1.1 - Mend

red-candle 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

checksums.yaml +4 -4
data/Cargo.lock +244 -6
data/README.md +38 -3
data/Rakefile +46 -1
data/ext/candle/Cargo.toml +2 -0
data/ext/candle/src/lib.rs +2 -0
data/ext/candle/src/llm/constrained_generation_test.rs +316 -0
data/ext/candle/src/llm/gemma.rs +21 -5
data/ext/candle/src/llm/generation_config.rs +11 -0
data/ext/candle/src/llm/llama.rs +21 -5
data/ext/candle/src/llm/mistral.rs +21 -5
data/ext/candle/src/llm/mod.rs +5 -0
data/ext/candle/src/llm/phi.rs +301 -0
data/ext/candle/src/llm/quantized_gguf.rs +173 -9
data/ext/candle/src/llm/qwen.rs +245 -0
data/ext/candle/src/llm/text_generation.rs +183 -26
data/ext/candle/src/ner.rs +25 -51
data/ext/candle/src/reranker.rs +41 -68
data/ext/candle/src/ruby/device.rs +5 -0
data/ext/candle/src/ruby/llm.rs +119 -55
data/ext/candle/src/ruby/mod.rs +1 -0
data/ext/candle/src/ruby/structured.rs +47 -0
data/ext/candle/src/structured/integration_test.rs +130 -0
data/ext/candle/src/structured/mod.rs +31 -0
data/ext/candle/src/structured/schema_processor.rs +215 -0
data/ext/candle/src/structured/vocabulary_adapter.rs +152 -0
data/ext/candle/src/structured/vocabulary_adapter_real_test.rs +66 -0
data/ext/candle/src/structured/vocabulary_adapter_simple_test.rs +70 -0
data/lib/candle/llm.rb +203 -2
data/lib/candle/version.rb +1 -1
metadata +14 -4

data/ext/candle/src/reranker.rs CHANGED

@@ -4,7 +4,6 @@ use candle_core::{Device as CoreDevice, Tensor, IndexOp, DType};
 use candle_nn::{VarBuilder, Linear, Module, ops::sigmoid};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::{EncodeInput, Tokenizer};
-use std::thread;
 use crate::ruby::{Device, Result};
 use crate::tokenizer::{TokenizerWrapper, loader::TokenizerLoader};
@@ -24,8 +23,7 @@ impl Reranker {
     }
     fn new_with_core_device(model_id: String, device: CoreDevice) -> std::result::Result<Self, Error> {
-        let device_clone = device.clone();
-        let handle = thread::spawn(move || -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
+        let result = (|| -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
             let api = Api::new()?;
             let repo = api.repo(Repo::new(model_id.clone(), RepoType::Model));
@@ -44,7 +42,7 @@ impl Reranker {
             // Load model weights
             let vb = unsafe {
-                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device_clone)?
+                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device)?
             };
             // Load BERT model
@@ -57,17 +55,49 @@ impl Reranker {
             let classifier = candle_nn::linear(config.hidden_size, 1, vb.pp("classifier"))?;
             Ok((model, TokenizerWrapper::new(tokenizer), pooler, classifier))
-        });
+        })();
-        match handle.join() {
-            Ok(Ok((model, tokenizer, pooler, classifier))) => {
+        match result {
+            Ok((model, tokenizer, pooler, classifier)) => {
                 Ok(Self { model, tokenizer, pooler, classifier, device })
             }
-            Ok(Err(e)) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
-            Err(_) => Err(Error::new(magnus::exception::runtime_error(), "Thread panicked while loading model")),
+            Err(e) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
         }
     }
+    /// Extract CLS embeddings from the model output, handling Metal device workarounds
+    fn extract_cls_embeddings(&self, embeddings: &Tensor) -> std::result::Result<Tensor, Error> {
+        let cls_embeddings = if self.device.is_metal() {
+            // Metal has issues with tensor indexing, use a different approach
+            let (batch_size, seq_len, hidden_size) = embeddings.dims3()
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
+            // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
+            let reshaped = embeddings.reshape((batch_size * seq_len, hidden_size))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
+            // Extract CLS tokens (first token of each sequence)
+            let mut cls_vecs = Vec::new();
+            for i in 0..batch_size {
+                let start_idx = i * seq_len;
+                let cls_vec = reshaped.narrow(0, start_idx, 1)
+                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
+                cls_vecs.push(cls_vec);
+            }
+            // Stack the CLS vectors
+            Tensor::cat(&cls_vecs, 0)
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
+        } else {
+            embeddings.i((.., 0))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
+        };
+        // Ensure tensor is contiguous for downstream operations
+        cls_embeddings.contiguous()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))
+    }
     pub fn debug_tokenization(&self, query: String, document: String) -> std::result::Result<magnus::RHash, Error> {
         // Create query-document pair for cross-encoder
         let query_doc_pair: EncodeInput = (query.clone(), document.clone()).into();
@@ -131,37 +161,7 @@ impl Reranker {
         let pooled_embeddings = match pooling_method.as_str() {
             "pooler" => {
                 // Extract [CLS] token and apply pooler (dense + tanh)
-                // Work around Metal indexing issue by using narrow instead of i((.., 0))
-                let cls_embeddings = if self.device.is_metal() {
-                    // Metal has issues with tensor indexing, use a different approach
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-                    // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-                    // Extract CLS tokens (first token of each sequence)
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-                    // Stack the CLS vectors
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure tensor is contiguous before linear layer
-                let cls_embeddings = cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make cls_embeddings contiguous: {}", e)))?;
+                let cls_embeddings = self.extract_cls_embeddings(&embeddings)?;
                 let pooled = self.pooler.forward(&cls_embeddings)
                     .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Pooler forward failed: {}", e)))?;
                 pooled.tanh()
@@ -169,34 +169,7 @@ impl Reranker {
             },
             "cls" => {
                 // Just use the [CLS] token embeddings directly (no pooler layer)
-                // Work around Metal indexing issue
-                let cls_embeddings = if self.device.is_metal() {
-                    // Use same approach as pooler method
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure contiguous for classifier
-                cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))?
+                self.extract_cls_embeddings(&embeddings)?
             },
             "mean" => {
                 // Mean pooling across all tokens

data/ext/candle/src/ruby/device.rs CHANGED

@@ -162,6 +162,10 @@ impl Device {
     pub fn __str__(&self) -> String {
         self.__repr__()
     }
+    pub fn __eq__(&self, other: &Device) -> bool {
+        self == other
+    }
 }
 impl magnus::TryConvert for Device {
@@ -193,5 +197,6 @@ pub fn init(rb_candle: RModule) -> Result<()> {
     rb_device.define_singleton_method("default", function!(default_device, 0))?;
     rb_device.define_method("to_s", method!(Device::__str__, 0))?;
     rb_device.define_method("inspect", method!(Device::__repr__, 0))?;
+    rb_device.define_method("==", method!(Device::__eq__, 1))?;
     Ok(())
 }

data/ext/candle/src/ruby/llm.rs CHANGED

@@ -1,15 +1,18 @@
 use magnus::{function, method, prelude::*, Error, Module, RArray, RHash, RModule, Ruby, TryConvert, Value};
 use std::cell::RefCell;
+use std::sync::Arc;
-use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral, llama::Llama as RustLlama, gemma::Gemma as RustGemma, QuantizedGGUF as RustQuantizedGGUF};
+use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral, llama::Llama as RustLlama, gemma::Gemma as RustGemma, qwen::Qwen as RustQwen, phi::Phi as RustPhi, QuantizedGGUF as RustQuantizedGGUF};
 use crate::ruby::{Result, Device};
+use crate::ruby::structured::StructuredConstraint;
 // Use an enum to handle different model types instead of trait objects
-#[derive(Debug)]
 enum ModelType {
     Mistral(RustMistral),
     Llama(RustLlama),
     Gemma(RustGemma),
+    Qwen(RustQwen),
+    Phi(RustPhi),
     QuantizedGGUF(RustQuantizedGGUF),
 }
@@ -19,6 +22,8 @@ impl ModelType {
             ModelType::Mistral(m) => m.generate(prompt, config),
             ModelType::Llama(m) => m.generate(prompt, config),
             ModelType::Gemma(m) => m.generate(prompt, config),
+            ModelType::Qwen(m) => m.generate(prompt, config),
+            ModelType::Phi(m) => m.generate(prompt, config),
             ModelType::QuantizedGGUF(m) => m.generate(prompt, config),
         }
     }
@@ -33,6 +38,8 @@ impl ModelType {
             ModelType::Mistral(m) => m.generate_stream(prompt, config, callback),
             ModelType::Llama(m) => m.generate_stream(prompt, config, callback),
             ModelType::Gemma(m) => m.generate_stream(prompt, config, callback),
+            ModelType::Qwen(m) => m.generate_stream(prompt, config, callback),
+            ModelType::Phi(m) => m.generate_stream(prompt, config, callback),
             ModelType::QuantizedGGUF(m) => m.generate_stream(prompt, config, callback),
         }
     }
@@ -42,6 +49,8 @@ impl ModelType {
             ModelType::Mistral(m) => m.clear_cache(),
             ModelType::Llama(m) => m.clear_cache(),
             ModelType::Gemma(m) => m.clear_cache(),
+            ModelType::Qwen(m) => m.clear_cache(),
+            ModelType::Phi(m) => m.clear_cache(),
             ModelType::QuantizedGGUF(m) => m.clear_cache(),
         }
     }
@@ -67,11 +76,33 @@ impl ModelType {
             },
             ModelType::Llama(m) => m.apply_chat_template(messages),
             ModelType::Gemma(m) => m.apply_chat_template(messages),
+            ModelType::Qwen(m) => m.apply_chat_template(messages),
+            ModelType::Phi(m) => m.apply_chat_template(messages),
             ModelType::QuantizedGGUF(m) => m.apply_chat_template(messages),
         }
     }
 }
+// Macro to extract parameters from Ruby hash to reduce boilerplate
+macro_rules! extract_param {
+    // Basic parameter extraction
+    ($kwargs:expr, $config:expr, $param:ident) => {
+        if let Some(value) = $kwargs.get(magnus::Symbol::new(stringify!($param))) {
+            if let Ok(v) = TryConvert::try_convert(value) {
+                $config.$param = v;
+            }
+        }
+    };
+    // Optional parameter extraction (wraps in Some)
+    ($kwargs:expr, $config:expr, $param:ident, optional) => {
+        if let Some(value) = $kwargs.get(magnus::Symbol::new(stringify!($param))) {
+            if let Ok(v) = TryConvert::try_convert(value) {
+                $config.$param = Some(v);
+            }
+        }
+    };
+}
 #[derive(Clone, Debug)]
 #[magnus::wrap(class = "Candle::GenerationConfig", mark, free_immediately)]
 pub struct GenerationConfig {
@@ -82,55 +113,20 @@ impl GenerationConfig {
     pub fn new(kwargs: RHash) -> Result<Self> {
         let mut config = RustGenerationConfig::default();
-        // Extract values from kwargs manually
-        if let Some(value) = kwargs.get(magnus::Symbol::new("max_length")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.max_length = v;
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("temperature")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.temperature = v;
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("top_p")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.top_p = Some(v);
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("top_k")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.top_k = Some(v);
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("repetition_penalty")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.repetition_penalty = v;
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("repetition_penalty_last_n")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.repetition_penalty_last_n = v;
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("seed")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.seed = v;
-            }
-        }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("include_prompt")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.include_prompt = v;
-            }
-        }
+        // Extract basic parameters using macro
+        extract_param!(kwargs, config, max_length);
+        extract_param!(kwargs, config, temperature);
+        extract_param!(kwargs, config, top_p, optional);
+        extract_param!(kwargs, config, top_k, optional);
+        extract_param!(kwargs, config, repetition_penalty);
+        extract_param!(kwargs, config, repetition_penalty_last_n);
+        extract_param!(kwargs, config, seed);
+        extract_param!(kwargs, config, include_prompt);
+        extract_param!(kwargs, config, debug_tokens);
+        extract_param!(kwargs, config, stop_on_constraint_satisfaction);
+        extract_param!(kwargs, config, stop_on_match);
+        // Handle special cases that need custom logic
         if let Some(value) = kwargs.get(magnus::Symbol::new("stop_sequences")) {
             if let Ok(arr) = <RArray as TryConvert>::try_convert(value) {
                 config.stop_sequences = arr
@@ -140,9 +136,9 @@ impl GenerationConfig {
             }
         }
-        if let Some(value) = kwargs.get(magnus::Symbol::new("debug_tokens")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.debug_tokens = v;
+        if let Some(value) = kwargs.get(magnus::Symbol::new("constraint")) {
+            if let Ok(constraint) = <&StructuredConstraint as TryConvert>::try_convert(value) {
+                config.constraint = Some(Arc::clone(&constraint.index));
             }
         }
@@ -191,9 +187,23 @@ impl GenerationConfig {
     pub fn debug_tokens(&self) -> bool {
         self.inner.debug_tokens
     }
+    pub fn stop_on_constraint_satisfaction(&self) -> bool {
+        self.inner.stop_on_constraint_satisfaction
+    }
+    pub fn stop_on_match(&self) -> bool {
+        self.inner.stop_on_match
+    }
+    pub fn constraint(&self) -> Option<StructuredConstraint> {
+        self.inner.constraint.as_ref().map(|c| StructuredConstraint {
+            index: Arc::clone(c),
+        })
+    }
 }
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 #[magnus::wrap(class = "Candle::LLM", mark, free_immediately)]
 pub struct LLM {
     model: std::sync::Arc<std::sync::Mutex<RefCell<ModelType>>>,
@@ -251,10 +261,22 @@ impl LLM {
                 })
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Gemma(gemma)
+            } else if model_lower.contains("qwen") {
+                let qwen = rt.block_on(async {
+                    RustQwen::from_pretrained(&model_id, candle_device).await
+                })
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+                ModelType::Qwen(qwen)
+            } else if model_lower.contains("phi") {
+                let phi = rt.block_on(async {
+                    RustPhi::from_pretrained(&model_id, candle_device).await
+                })
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+                ModelType::Phi(phi)
             } else {
                 return Err(Error::new(
                     magnus::exception::runtime_error(),
-                    format!("Unsupported model type: {}. Currently Mistral, Llama, and Gemma models are supported.", model_id),
+                    format!("Unsupported model type: {}. Currently Mistral, Llama, Gemma, Qwen, and Phi models are supported.", model_id),
                 ));
             }
         };
@@ -332,9 +354,47 @@ impl LLM {
             ModelType::Mistral(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
             ModelType::Llama(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
             ModelType::Gemma(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
+            ModelType::Qwen(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
+            ModelType::Phi(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
             ModelType::QuantizedGGUF(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
         }
     }
+    /// Get the EOS token string for this model
+    pub fn eos_token(&self) -> Result<String> {
+        let (eos_token_id, tokenizer_clone) = {
+            let model = match self.model.lock() {
+                Ok(guard) => guard,
+                Err(poisoned) => poisoned.into_inner(),
+            };
+            let model_ref = model.borrow();
+            // Get both EOS token ID and tokenizer clone in one lock scope
+            let eos_id = match &*model_ref {
+                ModelType::Mistral(m) => m.eos_token_id(),
+                ModelType::Llama(m) => m.eos_token_id(),
+                ModelType::Gemma(m) => m.eos_token_id(),
+                ModelType::Qwen(m) => m.eos_token_id(),
+                ModelType::Phi(m) => m.eos_token_id(),
+                ModelType::QuantizedGGUF(m) => m.eos_token_id(),
+            };
+            let tokenizer = match &*model_ref {
+                ModelType::Mistral(m) => m.tokenizer().clone(),
+                ModelType::Llama(m) => m.tokenizer().clone(),
+                ModelType::Gemma(m) => m.tokenizer().clone(),
+                ModelType::Qwen(m) => m.tokenizer().clone(),
+                ModelType::Phi(m) => m.tokenizer().clone(),
+                ModelType::QuantizedGGUF(m) => m.tokenizer().clone(),
+            };
+            (eos_id, tokenizer)
+        }; // Lock is released here
+        // Convert ID to string using the tokenizer
+        let tokenizer_wrapper = crate::ruby::tokenizer::Tokenizer(tokenizer_clone);
+        tokenizer_wrapper.id_to_token(eos_token_id as i64)
+    }
     /// Clear the model's cache (e.g., KV cache for transformers)
     pub fn clear_cache(&self) -> Result<()> {
@@ -423,6 +483,9 @@ pub fn init_llm(rb_candle: RModule) -> Result<()> {
     rb_generation_config.define_method("stop_sequences", method!(GenerationConfig::stop_sequences, 0))?;
     rb_generation_config.define_method("include_prompt", method!(GenerationConfig::include_prompt, 0))?;
     rb_generation_config.define_method("debug_tokens", method!(GenerationConfig::debug_tokens, 0))?;
+    rb_generation_config.define_method("stop_on_constraint_satisfaction", method!(GenerationConfig::stop_on_constraint_satisfaction, 0))?;
+    rb_generation_config.define_method("stop_on_match", method!(GenerationConfig::stop_on_match, 0))?;
+    rb_generation_config.define_method("constraint", method!(GenerationConfig::constraint, 0))?;
     let rb_llm = rb_candle.define_class("LLM", magnus::class::object())?;
     rb_llm.define_singleton_method("_from_pretrained", function!(from_pretrained_wrapper, -1))?;
@@ -431,6 +494,7 @@ pub fn init_llm(rb_candle: RModule) -> Result<()> {
     rb_llm.define_method("model_name", method!(LLM::model_name, 0))?;
     rb_llm.define_method("device", method!(LLM::device, 0))?;
     rb_llm.define_method("tokenizer", method!(LLM::tokenizer, 0))?;
+    rb_llm.define_method("eos_token", method!(LLM::eos_token, 0))?;
     rb_llm.define_method("clear_cache", method!(LLM::clear_cache, 0))?;
     rb_llm.define_method("apply_chat_template", method!(LLM::apply_chat_template, 1))?;

data/ext/candle/src/ruby/mod.rs CHANGED

@@ -7,6 +7,7 @@ pub mod errors;
 pub mod utils;
 pub mod llm;
 pub mod tokenizer;
+pub mod structured;
 pub use embedding_model::{EmbeddingModel, EmbeddingModelInner};
 pub use tensor::Tensor;

data/ext/candle/src/ruby/structured.rs ADDED

@@ -0,0 +1,47 @@
+use magnus::{Error, Module, RModule, function, Object};
+use std::sync::Arc;
+use crate::structured::{SchemaProcessor, VocabularyAdapter, Index};
+use crate::ruby::{Result, tokenizer::Tokenizer};
+/// Ruby wrapper for structured generation constraints
+#[derive(Clone, Debug)]
+#[magnus::wrap(class = "Candle::StructuredConstraint", mark, free_immediately)]
+pub struct StructuredConstraint {
+    pub(crate) index: Arc<Index>,
+}
+impl StructuredConstraint {
+    /// Create a constraint from a JSON schema
+    pub fn from_schema(schema: String, tokenizer: &Tokenizer) -> Result<Self> {
+        let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
+        let processor = SchemaProcessor::new();
+        let index = processor.process_schema(&schema, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process schema: {}", e)))?;
+        Ok(Self { index })
+    }
+    /// Create a constraint from a regex pattern
+    pub fn from_regex(pattern: String, tokenizer: &Tokenizer) -> Result<Self> {
+        let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
+        let processor = SchemaProcessor::new();
+        let index = processor.process_regex(&pattern, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process regex: {}", e)))?;
+        Ok(Self { index })
+    }
+}
+pub fn init_structured(rb_candle: RModule) -> Result<()> {
+    let class = rb_candle.define_class("StructuredConstraint", magnus::class::object())?;
+    class.define_singleton_method("from_schema", function!(StructuredConstraint::from_schema, 2))?;
+    class.define_singleton_method("from_regex", function!(StructuredConstraint::from_regex, 2))?;
+    Ok(())
+}

data/ext/candle/src/structured/integration_test.rs ADDED

@@ -0,0 +1,130 @@
+#[cfg(test)]
+mod integration_tests {
+    use super::super::*;
+    use crate::tokenizer::{TokenizerWrapper, loader::TokenizerLoader};
+    use std::sync::Arc;
+    #[tokio::test]
+    async fn test_schema_processor_with_vocabulary() {
+        // This test requires a tokenizer to create a vocabulary
+        let tokenizer_result = TokenizerLoader::from_hf_hub("bert-base-uncased", None).await;
+        if let Ok(tokenizer) = tokenizer_result {
+            let wrapper = TokenizerWrapper::new(tokenizer);
+            // Create vocabulary from tokenizer
+            let vocabulary = VocabularyAdapter::from_tokenizer(&wrapper)
+                .expect("Should create vocabulary");
+            // Create schema processor
+            let processor = SchemaProcessor::new();
+            // Test with a simple JSON schema
+            let schema = r#"{
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string"},
+                    "age": {"type": "integer"}
+                },
+                "required": ["name", "age"]
+            }"#;
+            // Process schema into Index
+            let index_result = processor.process_schema(schema, &vocabulary);
+            assert!(index_result.is_ok(), "Should process schema successfully");
+            // Test caching - second call should use cache
+            let index2_result = processor.process_schema(schema, &vocabulary);
+            assert!(index2_result.is_ok(), "Should retrieve from cache");
+            // Both should be the same Arc
+            let index1 = index_result.unwrap();
+            let index2 = index2_result.unwrap();
+            assert!(Arc::ptr_eq(&index1, &index2), "Should return cached Index");
+            // Check cache stats
+            let (size, _) = processor.cache_stats();
+            assert_eq!(size, 1, "Cache should have one entry");
+        } else {
+            eprintln!("Skipping integration test - couldn't load tokenizer");
+        }
+    }
+    #[tokio::test]
+    async fn test_regex_processing() {
+        let tokenizer_result = TokenizerLoader::from_hf_hub("bert-base-uncased", None).await;
+        if let Ok(tokenizer) = tokenizer_result {
+            let wrapper = TokenizerWrapper::new(tokenizer);
+            let vocabulary = VocabularyAdapter::from_tokenizer(&wrapper)
+                .expect("Should create vocabulary");
+            let processor = SchemaProcessor::new();
+            // Test with a simple regex pattern
+            let email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}";
+            let index_result = processor.process_regex(email_regex, &vocabulary);
+            assert!(index_result.is_ok(), "Should process regex successfully");
+            // Test different regex
+            let phone_regex = r"\d{3}-\d{3}-\d{4}";
+            let phone_index_result = processor.process_regex(phone_regex, &vocabulary);
+            assert!(phone_index_result.is_ok(), "Should process phone regex");
+            // Cache should have both
+            let (size, _) = processor.cache_stats();
+            assert_eq!(size, 2, "Cache should have two entries");
+            // Clear cache
+            processor.clear_cache();
+            let (size, _) = processor.cache_stats();
+            assert_eq!(size, 0, "Cache should be empty after clear");
+        }
+    }
+    #[test]
+    fn test_various_json_schemas() {
+        let _processor = SchemaProcessor::new();
+        // Array schema
+        let array_schema = serde_json::json!({
+            "type": "array",
+            "items": {"type": "string"}
+        });
+        // Process as a full schema instead of testing private method
+        // This would need a mock vocabulary in a real test
+        // For now, just verify the schema is valid JSON
+        let json_str = serde_json::to_string(&array_schema).unwrap();
+        assert!(!json_str.is_empty(), "Should serialize array schema");
+        // Nested object schema
+        let nested_schema = serde_json::json!({
+            "type": "object",
+            "properties": {
+                "user": {
+                    "type": "object",
+                    "properties": {
+                        "id": {"type": "integer"},
+                        "email": {"type": "string", "format": "email"}
+                    }
+                }
+            }
+        });
+        // Verify nested schema is valid
+        let json_str = serde_json::to_string(&nested_schema).unwrap();
+        assert!(json_str.contains("properties"), "Should have nested properties");
+        // Schema with enum
+        let enum_schema = serde_json::json!({
+            "type": "string",
+            "enum": ["red", "green", "blue"]
+        });
+        // Verify enum schema is valid
+        let json_str = serde_json::to_string(&enum_schema).unwrap();
+        assert!(json_str.contains("enum"), "Should have enum values");
+    }
+}

data/ext/candle/src/structured/mod.rs ADDED

@@ -0,0 +1,31 @@
+/// Structured generation support using Outlines
+///
+/// This module provides functionality to constrain language model generation
+/// to follow specific patterns, such as JSON schemas or regular expressions.
+pub mod vocabulary_adapter;
+pub mod schema_processor;
+pub use vocabulary_adapter::VocabularyAdapter;
+pub use schema_processor::SchemaProcessor;
+// Re-export commonly used types from outlines-core
+pub use outlines_core::prelude::Index;
+pub use outlines_core::vocabulary::Vocabulary;
+#[cfg(test)]
+mod vocabulary_adapter_simple_test;
+#[cfg(test)]
+mod integration_test;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_module_imports() {
+        // Ensure all exports are available
+        let _ = VocabularyAdapter;
+    }
+}