red-candle 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/ext/candle/src/llm/constrained_generation_test.rs +199 -6
- data/ext/candle/src/llm/gemma.rs +21 -5
- data/ext/candle/src/llm/generation_config.rs +6 -0
- data/ext/candle/src/llm/llama.rs +21 -5
- data/ext/candle/src/llm/mistral.rs +21 -5
- data/ext/candle/src/llm/phi.rs +21 -5
- data/ext/candle/src/llm/quantized_gguf.rs +18 -5
- data/ext/candle/src/llm/qwen.rs +21 -5
- data/ext/candle/src/llm/text_generation.rs +121 -28
- data/ext/candle/src/ner.rs +25 -51
- data/ext/candle/src/reranker.rs +41 -68
- data/ext/candle/src/ruby/llm.rs +81 -55
- data/lib/candle/llm.rb +129 -34
- data/lib/candle/version.rb +1 -1
- metadata +2 -2
data/ext/candle/src/llm/text_generation.rs
CHANGED
@@ -10,39 +10,29 @@ pub struct TextGeneration {
     logits_processor: LogitsProcessor,
     tokens: Vec<u32>,
     eos_token_id: Option<u32>,
+    repetition_penalty: f32,
+    repetition_penalty_last_n: usize,
     constraint: Option<Arc<Index>>,
     constraint_state: Option<u32>,
+    constraint_completed: bool,
+    tokens_since_constraint_start: usize,
 }
 
 impl TextGeneration {
-    pub fn new(
-        seed: u64,
-        temperature: Option<f64>,
-        top_p: Option<f64>,
-        _top_k: Option<usize>,
-        _repetition_penalty: f32,
-        _repetition_penalty_last_n: usize,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temperature, top_p);
-
-        Self {
+    pub fn new(config: &GenerationConfig) -> Self {
+        let logits_processor = LogitsProcessor::new(config.seed, Some(config.temperature), config.top_p);
+
+        let mut text_gen = Self {
             logits_processor,
             tokens: Vec::new(),
             eos_token_id: None,
+            repetition_penalty: config.repetition_penalty,
+            repetition_penalty_last_n: config.repetition_penalty_last_n,
             constraint: None,
             constraint_state: None,
-        }
-    }
-
-    pub fn from_config(config: &GenerationConfig) -> Self {
-        let mut text_gen = Self::new(
-            config.seed,
-            Some(config.temperature),
-            config.top_p,
-            config.top_k,
-            config.repetition_penalty,
-            config.repetition_penalty_last_n,
-        );
+            constraint_completed: false,
+            tokens_since_constraint_start: 0,
+        };
 
         // Set constraint if provided
         if let Some(ref constraint) = config.constraint {
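For orientation: TextGeneration::new now takes the whole config, replacing both the old positional constructor and from_config. A sketch of the GenerationConfig fields this diff relies on (the real definition lives in generation_config.rs, which also changed in this release; the exact types here are inferred from usage, not confirmed):

// Inferred shape of GenerationConfig; field types are assumptions based on
// how the fields are used elsewhere in this diff.
pub struct GenerationConfig {
    pub max_length: usize,
    pub temperature: f64,
    pub top_p: Option<f64>,
    pub top_k: Option<usize>,
    pub repetition_penalty: f32,
    pub repetition_penalty_last_n: usize,
    pub seed: u64,
    pub stop_sequences: Vec<String>,
    pub include_prompt: bool,
    pub debug_tokens: bool,
    pub stop_on_constraint_satisfaction: bool,
    pub stop_on_match: bool,
    pub constraint: Option<Arc<Index>>, // outlines-core FSM index
}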
@@ -72,6 +62,8 @@ impl TextGeneration {
         // Initialize with the first state
         self.constraint_state = Some(constraint.initial_state());
         self.constraint = Some(constraint);
+        self.constraint_completed = false;
+        self.tokens_since_constraint_start = self.tokens.len();
     }
 
     /// Apply constraints to logits by masking disallowed tokens
@@ -137,13 +129,12 @@ impl TextGeneration {
     pub fn sample_next_token(
         &mut self,
         logits: &Tensor,
-        repetition_penalty: Option<(f32, usize)>,
     ) -> CandleResult<u32> {
         let mut logits = logits.clone();
 
-        // Apply repetition penalty
-        if let Some((penalty, last_n)) = repetition_penalty {
-            self.apply_repetition_penalty(&mut logits, penalty, last_n)?;
+        // Apply repetition penalty using stored parameters
+        if self.repetition_penalty != 1.0 {
+            self.apply_repetition_penalty(&mut logits, self.repetition_penalty, self.repetition_penalty_last_n)?;
         }
 
         // Apply constraints if active
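The penalty parameters now live on the struct instead of being threaded through every call, and a penalty of exactly 1.0 (a no-op) skips the pass entirely. apply_repetition_penalty itself is not shown in this diff; for reference, a CTRL-style penalty over the last n generated tokens typically looks like the following sketch (plain slices instead of candle tensors; the signature is hypothetical):

// Hypothetical sketch of a CTRL-style repetition penalty, operating on a
// plain logits slice rather than a candle Tensor. Tokens seen in the last
// `last_n` positions get their logits pushed toward "less likely".
fn apply_repetition_penalty(logits: &mut [f32], tokens: &[u32], penalty: f32, last_n: usize) {
    let start = tokens.len().saturating_sub(last_n);
    for &tok in &tokens[start..] {
        let idx = tok as usize;
        if idx < logits.len() {
            if logits[idx] > 0.0 {
                logits[idx] /= penalty; // shrink positive logits
            } else {
                logits[idx] *= penalty; // push negative logits further down
            }
        }
    }
}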
@@ -155,12 +146,114 @@ impl TextGeneration {
 
         // Update constraint state if active
         if let (Some(ref constraint_index), Some(current_state)) = (&self.constraint, self.constraint_state) {
-            self.constraint_state = constraint_index.next_state(&current_state, &next_token);
+            // Get the next state
+            let next_state = constraint_index.next_state(&current_state, &next_token);
+
+            // Check if we're transitioning to a state with no allowed tokens (completion)
+            if !self.constraint_completed && self.tokens.len() > self.tokens_since_constraint_start {
+                // Check if we've transitioned from a constrained state to an unconstrained state
+                // This happens when the pattern is complete and the FSM allows "anything"
+
+                let current_constrained = if let Some(allowed) = constraint_index.allowed_tokens(&current_state) {
+                    // Consider it constrained if we have a limited set of allowed tokens
+                    allowed.len() < 1000 // Arbitrary threshold for "constrained"
+                } else {
+                    true // No tokens allowed is definitely constrained
+                };
+
+                let next_constrained = if let Some(next_state_val) = next_state {
+                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
+                        allowed.is_empty() || allowed.len() < 1000
+                    } else {
+                        true
+                    }
+                } else {
+                    true
+                };
+
+                // If we're transitioning from constrained to unconstrained, we've completed the pattern
+                if current_constrained && !next_constrained {
+                    self.constraint_completed = true;
+                }
+
+                // Also check if next state has no allowed tokens at all
+                if let Some(next_state_val) = next_state {
+                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
+                        if allowed.is_empty() {
+                            self.constraint_completed = true;
+                        }
+                    } else {
+                        // None means no tokens allowed - constraint is complete
+                        self.constraint_completed = true;
+                    }
+                }
+            }
+
+            self.constraint_state = next_state;
         }
 
         Ok(next_token)
     }
 
+    /// Check if the constraint is satisfied (reached a valid completion state)
+    pub fn is_constraint_satisfied(&self) -> bool {
+        // If we've explicitly marked the constraint as completed, return true
+        if self.constraint_completed {
+            return true;
+        }
+
+        // Also check the current state
+        if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
+            // Check if the constraint has reached a state where it could validly end
+            // This happens when:
+            // 1. We have no more allowed tokens (constraint fully satisfied)
+            // 2. The EOS token is in the allowed tokens (optional ending)
+            if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                // If no tokens are allowed, the constraint is fully satisfied
+                if allowed.is_empty() {
+                    return true;
+                }
+
+                // If EOS token is allowed, we've reached an optional completion point
+                if let Some(eos) = self.eos_token_id {
+                    if allowed.contains(&eos) {
+                        return true;
+                    }
+                }
+            } else {
+                // None means no tokens allowed - constraint is satisfied
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Check if the constraint is satisfied when stop_on_match is true
+    pub fn is_constraint_satisfied_stop_on_match(&self) -> bool {
+        // When stop_on_match is true, we stop as soon as the constraint is completed
+        if self.constraint_completed {
+            return true;
+        }
+
+        // Also check if we're currently in a state that could be a valid end
+        // This is important for patterns like phone numbers where after matching
+        // the pattern, the FSM might allow any token (including more numbers)
+        if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
+            // Check if we've generated at least one token since constraint start
+            if self.tokens.len() > self.tokens_since_constraint_start {
+                if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                    // If the allowed tokens set is very large (unconstrained),
+                    // it means the pattern has been satisfied
+                    if allowed.len() > 1000 {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        false
+    }
+
     /// Check if we should stop generation
     pub fn should_stop(&self, token: u32, max_length: usize) -> bool {
         if self.tokens.len() >= max_length {
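The completion heuristic above treats a state as "constrained" when its allowed-token set is small and as "unconstrained" when it is large, and marks the pattern complete on a constrained-to-unconstrained transition. A minimal standalone restatement of that decision, with plain functions in place of the outlines-core Index (the threshold of 1000 is the same arbitrary cutoff the diff uses):

// Standalone restatement of the diff's heuristic. `allowed` mirrors
// Index::allowed_tokens: None means no token is permitted in this state.
fn is_constrained(allowed: Option<&[u32]>) -> bool {
    match allowed {
        Some(a) => a.is_empty() || a.len() < 1000, // small set: still inside the pattern
        None => true,                              // nothing allowed: terminal/constrained
    }
}

// The pattern counts as complete when we step from a constrained state
// into an unconstrained one (the FSM starts allowing "anything").
fn pattern_completed(current: Option<&[u32]>, next: Option<&[u32]>) -> bool {
    is_constrained(current) && !is_constrained(next)
}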
data/ext/candle/src/ner.rs
CHANGED
@@ -39,13 +39,9 @@ impl NER {
     pub fn new(model_id: String, device: Option<Device>, tokenizer_id: Option<String>) -> Result<Self> {
         let device = device.unwrap_or(Device::Cpu).as_device()?;
 
-
-        let device_clone = device.clone();
-        let model_id_clone = model_id.clone();
-
-        let handle = std::thread::spawn(move || -> std::result::Result<(BertModel, TokenizerWrapper, Linear, NERConfig), Box<dyn std::error::Error + Send + Sync>> {
+        let result = (|| -> std::result::Result<(BertModel, TokenizerWrapper, Linear, NERConfig), Box<dyn std::error::Error + Send + Sync>> {
             let api = Api::new()?;
-            let repo = api.repo(Repo::new(model_id_clone, RepoType::Model));
+            let repo = api.repo(Repo::new(model_id.clone(), RepoType::Model));
 
             // Download model files
             let config_filename = repo.get("config.json")?;
@@ -92,7 +88,7 @@ impl NER {
 
             // Load model weights
             let vb = unsafe {
-                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device_clone)?
+                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device)?
             };
 
             // Load BERT model
@@ -106,10 +102,10 @@ impl NER {
             )?;
 
             Ok((model, TokenizerWrapper::new(tokenizer), classifier, ner_config))
-        });
+        })();
 
-        match handle.join() {
-            Ok(Ok((model, tokenizer, classifier, config))) => {
+        match result {
+            Ok((model, tokenizer, classifier, config)) => {
                 Ok(Self {
                     model,
                     tokenizer,
@@ -119,28 +115,20 @@ impl NER {
                     model_id,
                 })
             }
-            Ok(Err(e)) => Err(Error::new(
+            Err(e) => Err(Error::new(
                 magnus::exception::runtime_error(),
                 format!("Failed to load NER model: {}", e)
             )),
-            Err(_) => Err(Error::new(
-                magnus::exception::runtime_error(),
-                "Thread panicked while loading NER model"
-            )),
         }
     }
 
-    /// Extract entities from text with confidence scores
-    pub fn extract_entities(&self, text: String, confidence_threshold: Option<f64>) -> Result<RArray> {
-        let threshold = confidence_threshold.unwrap_or(0.9) as f32;
-
+    /// Common tokenization and prediction logic
+    fn tokenize_and_predict(&self, text: &str) -> Result<(tokenizers::Encoding, Vec<Vec<f32>>)> {
         // Tokenize the text
-        let encoding = self.tokenizer.inner().encode(text.as_str(), true)
+        let encoding = self.tokenizer.inner().encode(text, true)
            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Tokenization failed: {}", e)))?;
 
         let token_ids = encoding.get_ids();
-        let tokens = encoding.get_tokens();
-        let offsets = encoding.get_offsets();
 
         // Convert to tensors
         let input_ids = Tensor::new(token_ids, &self.device)
@@ -171,6 +159,19 @@ impl NER {
             .to_vec2()
             .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
 
+        Ok((encoding, probs_vec))
+    }
+
+    /// Extract entities from text with confidence scores
+    pub fn extract_entities(&self, text: String, confidence_threshold: Option<f64>) -> Result<RArray> {
+        let threshold = confidence_threshold.unwrap_or(0.9) as f32;
+
+        // Use common tokenization and prediction logic
+        let (encoding, probs_vec) = self.tokenize_and_predict(&text)?;
+
+        let tokens = encoding.get_tokens();
+        let offsets = encoding.get_offsets();
+
         // Extract entities with BIO decoding
         let entities = self.decode_entities(
             &text,
@@ -199,38 +200,11 @@ impl NER {
 
     /// Get token-level predictions with labels and confidence scores
     pub fn predict_tokens(&self, text: String) -> Result<RArray> {
-        // Tokenize the text
-        let encoding = self.tokenizer.inner().encode(text.as_str(), true)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Tokenization failed: {}", e)))?;
+        // Use common tokenization and prediction logic
+        let (encoding, probs_vec) = self.tokenize_and_predict(&text)?;
 
-        let token_ids = encoding.get_ids();
         let tokens = encoding.get_tokens();
 
-        // Convert to tensors
-        let input_ids = Tensor::new(token_ids, &self.device)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?
-            .unsqueeze(0)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        let attention_mask = Tensor::ones_like(&input_ids)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let token_type_ids = Tensor::zeros_like(&input_ids)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        // Forward pass
-        let output = self.model.forward(&input_ids, &token_type_ids, Some(&attention_mask))
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let logits = self.classifier.forward(&output)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let probs = candle_nn::ops::softmax(&logits, 2)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        // Get predictions
-        let probs_vec: Vec<Vec<f32>> = probs.squeeze(0)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?
-            .to_vec2()
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
         // Build result array
         let result = RArray::new();
         for (i, (token, probs)) in tokens.iter().zip(probs_vec.iter()).enumerate() {
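Both loaders in this release drop the std::thread::spawn / handle.join() dance in favor of an immediately-invoked closure, which keeps ?-style error handling while removing the clones and the "thread panicked" error arm. The pattern in isolation (a minimal sketch, unrelated to the actual model-loading code):

// Minimal sketch of the immediately-invoked-closure pattern adopted above:
// the closure provides a scope where `?` works, and is called on the spot.
fn parse_port(s: &str) -> Result<u16, String> {
    let result = (|| -> Result<u16, Box<dyn std::error::Error>> {
        let port: u16 = s.trim().parse()?; // `?` propagates to the closure's return
        Ok(port)
    })();

    result.map_err(|e| format!("Failed to parse port: {}", e))
}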
data/ext/candle/src/reranker.rs
CHANGED
@@ -4,7 +4,6 @@ use candle_core::{Device as CoreDevice, Tensor, IndexOp, DType};
 use candle_nn::{VarBuilder, Linear, Module, ops::sigmoid};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::{EncodeInput, Tokenizer};
-use std::thread;
 use crate::ruby::{Device, Result};
 use crate::tokenizer::{TokenizerWrapper, loader::TokenizerLoader};
 
@@ -24,8 +23,7 @@ impl Reranker {
     }
 
     fn new_with_core_device(model_id: String, device: CoreDevice) -> std::result::Result<Self, Error> {
-        let device_clone = device.clone();
-        let handle = thread::spawn(move || -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
+        let result = (|| -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
             let api = Api::new()?;
             let repo = api.repo(Repo::new(model_id.clone(), RepoType::Model));
 
@@ -44,7 +42,7 @@ impl Reranker {
 
             // Load model weights
             let vb = unsafe {
-                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device_clone)?
+                VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device)?
             };
 
             // Load BERT model
@@ -57,17 +55,49 @@ impl Reranker {
         let classifier = candle_nn::linear(config.hidden_size, 1, vb.pp("classifier"))?;
 
         Ok((model, TokenizerWrapper::new(tokenizer), pooler, classifier))
-        });
+        })();
 
-        match handle.join() {
-            Ok(Ok((model, tokenizer, pooler, classifier))) => {
+        match result {
+            Ok((model, tokenizer, pooler, classifier)) => {
                 Ok(Self { model, tokenizer, pooler, classifier, device })
             }
-            Ok(Err(e)) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
-            Err(_) => Err(Error::new(magnus::exception::runtime_error(), "Thread panicked while loading model")),
+            Err(e) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
         }
     }
 
+    /// Extract CLS embeddings from the model output, handling Metal device workarounds
+    fn extract_cls_embeddings(&self, embeddings: &Tensor) -> std::result::Result<Tensor, Error> {
+        let cls_embeddings = if self.device.is_metal() {
+            // Metal has issues with tensor indexing, use a different approach
+            let (batch_size, seq_len, hidden_size) = embeddings.dims3()
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
+
+            // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
+            let reshaped = embeddings.reshape((batch_size * seq_len, hidden_size))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
+
+            // Extract CLS tokens (first token of each sequence)
+            let mut cls_vecs = Vec::new();
+            for i in 0..batch_size {
+                let start_idx = i * seq_len;
+                let cls_vec = reshaped.narrow(0, start_idx, 1)
+                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
+                cls_vecs.push(cls_vec);
+            }
+
+            // Stack the CLS vectors
+            Tensor::cat(&cls_vecs, 0)
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
+        } else {
+            embeddings.i((.., 0))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
+        };
+
+        // Ensure tensor is contiguous for downstream operations
+        cls_embeddings.contiguous()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))
+    }
+
     pub fn debug_tokenization(&self, query: String, document: String) -> std::result::Result<magnus::RHash, Error> {
         // Create query-document pair for cross-encoder
         let query_doc_pair: EncodeInput = (query.clone(), document.clone()).into();
@@ -131,37 +161,7 @@ impl Reranker {
         let pooled_embeddings = match pooling_method.as_str() {
             "pooler" => {
                 // Extract [CLS] token and apply pooler (dense + tanh)
-
-                let cls_embeddings = if self.device.is_metal() {
-                    // Metal has issues with tensor indexing, use a different approach
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-
-                    // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-
-                    // Extract CLS tokens (first token of each sequence)
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-
-                    // Stack the CLS vectors
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure tensor is contiguous before linear layer
-                let cls_embeddings = cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make cls_embeddings contiguous: {}", e)))?;
+                let cls_embeddings = self.extract_cls_embeddings(&embeddings)?;
                 let pooled = self.pooler.forward(&cls_embeddings)
                     .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Pooler forward failed: {}", e)))?;
                 pooled.tanh()
@@ -169,34 +169,7 @@ impl Reranker {
             },
             "cls" => {
                 // Just use the [CLS] token embeddings directly (no pooler layer)
-
-                let cls_embeddings = if self.device.is_metal() {
-                    // Use same approach as pooler method
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure contiguous for classifier
-                cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))?
+                self.extract_cls_embeddings(&embeddings)?
             },
             "mean" => {
                 // Mean pooling across all tokens
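The two inlined copies of the Metal workaround collapse into the single extract_cls_embeddings helper above. What it computes, restated on plain nested Vecs (a sketch, not the candle API): from a [batch, seq_len, hidden] tensor, keep position 0 (the [CLS] token) of every sequence.

// Sketch of what extract_cls_embeddings produces, using nested Vecs in
// place of a candle Tensor: embeddings[b][t] is the hidden vector for
// token t of batch item b; we keep t == 0 for each b.
fn cls_rows(embeddings: &[Vec<Vec<f32>>]) -> Vec<Vec<f32>> {
    embeddings.iter().map(|seq| seq[0].clone()).collect()
}

On non-Metal devices the same result comes from a single embeddings.i((.., 0)) index; the reshape/narrow/cat loop exists only because, per the code comments, Metal has issues with that tensor indexing.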
data/ext/candle/src/ruby/llm.rs
CHANGED
@@ -83,6 +83,26 @@ impl ModelType {
     }
 }
 
+// Macro to extract parameters from Ruby hash to reduce boilerplate
+macro_rules! extract_param {
+    // Basic parameter extraction
+    ($kwargs:expr, $config:expr, $param:ident) => {
+        if let Some(value) = $kwargs.get(magnus::Symbol::new(stringify!($param))) {
+            if let Ok(v) = TryConvert::try_convert(value) {
+                $config.$param = v;
+            }
+        }
+    };
+    // Optional parameter extraction (wraps in Some)
+    ($kwargs:expr, $config:expr, $param:ident, optional) => {
+        if let Some(value) = $kwargs.get(magnus::Symbol::new(stringify!($param))) {
+            if let Ok(v) = TryConvert::try_convert(value) {
+                $config.$param = Some(v);
+            }
+        }
+    };
+}
+
 #[derive(Clone, Debug)]
 #[magnus::wrap(class = "Candle::GenerationConfig", mark, free_immediately)]
 pub struct GenerationConfig {
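Each extract_param! call expands to the same lookup/convert/assign block that was previously written out by hand, with stringify!($param) supplying the Ruby symbol name. A toy, self-contained demo of the same macro shape, with a HashMap standing in for magnus::RHash (all names here are hypothetical):

use std::collections::HashMap;

#[derive(Default, Debug)]
struct Config {
    temperature: f64,
    top_p: Option<f64>,
}

// Same two-arm shape as the diff's extract_param!, minus the magnus types.
macro_rules! extract_param {
    ($kwargs:expr, $config:expr, $param:ident) => {
        if let Some(v) = $kwargs.get(stringify!($param)) {
            $config.$param = *v;
        }
    };
    ($kwargs:expr, $config:expr, $param:ident, optional) => {
        if let Some(v) = $kwargs.get(stringify!($param)) {
            $config.$param = Some(*v);
        }
    };
}

fn main() {
    let mut kwargs: HashMap<&str, f64> = HashMap::new();
    kwargs.insert("temperature", 0.7);
    kwargs.insert("top_p", 0.9);

    let mut config = Config::default();
    extract_param!(kwargs, config, temperature);
    extract_param!(kwargs, config, top_p, optional);
    // Prints: Config { temperature: 0.7, top_p: Some(0.9) }
    println!("{:?}", config);
}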
@@ -93,55 +113,20 @@ impl GenerationConfig {
     pub fn new(kwargs: RHash) -> Result<Self> {
         let mut config = RustGenerationConfig::default();
 
-        // Extract parameters
-        if let Some(value) = kwargs.get(magnus::Symbol::new("max_length")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.max_length = v;
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("temperature")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.temperature = v;
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("top_p")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.top_p = Some(v);
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("top_k")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.top_k = Some(v);
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("repetition_penalty")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.repetition_penalty = v;
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("repetition_penalty_last_n")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.repetition_penalty_last_n = v;
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("seed")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.seed = v;
-            }
-        }
-
-        if let Some(value) = kwargs.get(magnus::Symbol::new("include_prompt")) {
-            if let Ok(v) = TryConvert::try_convert(value) {
-                config.include_prompt = v;
-            }
-        }
+        // Extract basic parameters using macro
+        extract_param!(kwargs, config, max_length);
+        extract_param!(kwargs, config, temperature);
+        extract_param!(kwargs, config, top_p, optional);
+        extract_param!(kwargs, config, top_k, optional);
+        extract_param!(kwargs, config, repetition_penalty);
+        extract_param!(kwargs, config, repetition_penalty_last_n);
+        extract_param!(kwargs, config, seed);
+        extract_param!(kwargs, config, include_prompt);
+        extract_param!(kwargs, config, debug_tokens);
+        extract_param!(kwargs, config, stop_on_constraint_satisfaction);
+        extract_param!(kwargs, config, stop_on_match);
 
+        // Handle special cases that need custom logic
         if let Some(value) = kwargs.get(magnus::Symbol::new("stop_sequences")) {
             if let Ok(arr) = <RArray as TryConvert>::try_convert(value) {
                 config.stop_sequences = arr
|
@@ -151,13 +136,6 @@ impl GenerationConfig {
|
|
151
136
|
}
|
152
137
|
}
|
153
138
|
|
154
|
-
if let Some(value) = kwargs.get(magnus::Symbol::new("debug_tokens")) {
|
155
|
-
if let Ok(v) = TryConvert::try_convert(value) {
|
156
|
-
config.debug_tokens = v;
|
157
|
-
}
|
158
|
-
}
|
159
|
-
|
160
|
-
// Handle constraint parameter
|
161
139
|
if let Some(value) = kwargs.get(magnus::Symbol::new("constraint")) {
|
162
140
|
if let Ok(constraint) = <&StructuredConstraint as TryConvert>::try_convert(value) {
|
163
141
|
config.constraint = Some(Arc::clone(&constraint.index));
|
@@ -209,6 +187,15 @@ impl GenerationConfig {
     pub fn debug_tokens(&self) -> bool {
         self.inner.debug_tokens
     }
+
+    pub fn stop_on_constraint_satisfaction(&self) -> bool {
+        self.inner.stop_on_constraint_satisfaction
+    }
+
+    pub fn stop_on_match(&self) -> bool {
+        self.inner.stop_on_match
+    }
+
     pub fn constraint(&self) -> Option<StructuredConstraint> {
         self.inner.constraint.as_ref().map(|c| StructuredConstraint {
             index: Arc::clone(c),
@@ -372,6 +359,42 @@ impl LLM {
             ModelType::QuantizedGGUF(m) => Ok(crate::ruby::tokenizer::Tokenizer(m.tokenizer().clone())),
         }
     }
+
+    /// Get the EOS token string for this model
+    pub fn eos_token(&self) -> Result<String> {
+        let (eos_token_id, tokenizer_clone) = {
+            let model = match self.model.lock() {
+                Ok(guard) => guard,
+                Err(poisoned) => poisoned.into_inner(),
+            };
+            let model_ref = model.borrow();
+
+            // Get both EOS token ID and tokenizer clone in one lock scope
+            let eos_id = match &*model_ref {
+                ModelType::Mistral(m) => m.eos_token_id(),
+                ModelType::Llama(m) => m.eos_token_id(),
+                ModelType::Gemma(m) => m.eos_token_id(),
+                ModelType::Qwen(m) => m.eos_token_id(),
+                ModelType::Phi(m) => m.eos_token_id(),
+                ModelType::QuantizedGGUF(m) => m.eos_token_id(),
+            };
+
+            let tokenizer = match &*model_ref {
+                ModelType::Mistral(m) => m.tokenizer().clone(),
+                ModelType::Llama(m) => m.tokenizer().clone(),
+                ModelType::Gemma(m) => m.tokenizer().clone(),
+                ModelType::Qwen(m) => m.tokenizer().clone(),
+                ModelType::Phi(m) => m.tokenizer().clone(),
+                ModelType::QuantizedGGUF(m) => m.tokenizer().clone(),
+            };
+
+            (eos_id, tokenizer)
+        }; // Lock is released here
+
+        // Convert ID to string using the tokenizer
+        let tokenizer_wrapper = crate::ruby::tokenizer::Tokenizer(tokenizer_clone);
+        tokenizer_wrapper.id_to_token(eos_token_id as i64)
+    }
 
     /// Clear the model's cache (e.g., KV cache for transformers)
     pub fn clear_cache(&self) -> Result<()> {
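eos_token recovers from a poisoned lock rather than propagating the panic to Ruby: if another thread panicked while holding the model mutex, into_inner() on the PoisonError still hands back the guard. The pattern in isolation (a minimal sketch with a plain Mutex<u32>):

use std::sync::Mutex;

// A poisoned Mutex still yields its data: into_inner() on the PoisonError
// returns the guard, so a reader can proceed instead of bubbling the panic.
fn read_value(m: &Mutex<u32>) -> u32 {
    let guard = match m.lock() {
        Ok(g) => g,
        Err(poisoned) => poisoned.into_inner(),
    };
    *guard
}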
@@ -460,6 +483,8 @@ pub fn init_llm(rb_candle: RModule) -> Result<()> {
     rb_generation_config.define_method("stop_sequences", method!(GenerationConfig::stop_sequences, 0))?;
     rb_generation_config.define_method("include_prompt", method!(GenerationConfig::include_prompt, 0))?;
     rb_generation_config.define_method("debug_tokens", method!(GenerationConfig::debug_tokens, 0))?;
+    rb_generation_config.define_method("stop_on_constraint_satisfaction", method!(GenerationConfig::stop_on_constraint_satisfaction, 0))?;
+    rb_generation_config.define_method("stop_on_match", method!(GenerationConfig::stop_on_match, 0))?;
     rb_generation_config.define_method("constraint", method!(GenerationConfig::constraint, 0))?;
 
     let rb_llm = rb_candle.define_class("LLM", magnus::class::object())?;
@@ -469,6 +494,7 @@ pub fn init_llm(rb_candle: RModule) -> Result<()> {
     rb_llm.define_method("model_name", method!(LLM::model_name, 0))?;
     rb_llm.define_method("device", method!(LLM::device, 0))?;
     rb_llm.define_method("tokenizer", method!(LLM::tokenizer, 0))?;
+    rb_llm.define_method("eos_token", method!(LLM::eos_token, 0))?;
     rb_llm.define_method("clear_cache", method!(LLM::clear_cache, 0))?;
     rb_llm.define_method("apply_chat_template", method!(LLM::apply_chat_template, 1))?;