red-candle 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5a2b4fac15c2c261d8fea90d34973605209043e2b2a222f82414bf94bc5c47e8
-  data.tar.gz: daff4398f34170e20744ab2ee8b1abb5b977ffec15a129d678efced7c9649495
+  metadata.gz: a3678037fbb196c621c8e9df6a213b0d3dffbdb1b8b3dfd73eee4a7ea2feafca
+  data.tar.gz: ada97ef81af854439622bdc12b796442be9e0f31e7c7d8a5df374c7bfb07ff2e
 SHA512:
-  metadata.gz: 4391a7fb4072d9ac174bcecdb366c975c5e487f43fcc4b0db75533aa44c94822d38103a633eba0b098ac26cf31313e9dd7da77255ed83692584e6936cca86271
-  data.tar.gz: da7a15a86fea349069079537b8c6f6696079842d205168aa908644a59528d3ecf5922548d8b3d153494f0beb3fe6221b30110368f1f1289f0ac1ff4856f6b243
+  metadata.gz: d353177318c4599fa30974a676350087a8e5fd070fe3d317344a4e1b3ae022cb69adf742d62063c2da09dbab7e971cbfae1e53a87527ce7f1c18afd1223797e8
+  data.tar.gz: df4b2f43f6fb1aa623053fd09d6e48eba0d8c2615f51dc2accdc4dc292fb3fb7d665553b04cae3747e001e04cd4b9cdbe5c022c3efd077daddf97e074a1e9e5c
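The new checksums can be used to verify a downloaded 1.1.1 artifact before installing it. A minimal verification sketch in Rust (not part of the gem), assuming the `sha2` and `hex` crates:

```rust
use sha2::{Digest, Sha256};

// Recompute the SHA256 of an artifact and compare it to the published value.
fn verify_sha256(path: &str, expected_hex: &str) -> std::io::Result<bool> {
    let bytes = std::fs::read(path)?;
    Ok(hex::encode(Sha256::digest(&bytes)) == expected_hex)
}

fn main() -> std::io::Result<()> {
    // Expected value: the 1.1.1 data.tar.gz SHA256 from the table above.
    let ok = verify_sha256(
        "data.tar.gz",
        "ada97ef81af854439622bdc12b796442be9e0f31e7c7d8a5df374c7bfb07ff2e",
    )?;
    println!("checksum match: {ok}");
    Ok(())
}
```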
data/README.md CHANGED
@@ -58,7 +58,8 @@ end
 - **EmbeddingModel**: Generate embeddings for text
 - **Reranker**: Rerank documents based on relevance
 - **NER**: Named Entity Recognition directly from Ruby
-- **LLM**: Chat with Large Language Models (e.g., Llama, Mistral, Gemma)
+- **LLM**: Chat with Large Language Models (e.g., Llama, Mistral, Gemma, Qwen, Phi)
+- **Structured Generation**: Generate JSON from a schema or match a regular expression
 
 ## Model Storage
 
@@ -44,8 +44,8 @@ mod constrained_generation_tests {
         let config_without_constraint = GenerationConfig::default();
 
         // Create text generation instances
-        let mut gen_constrained = TextGeneration::from_config(&config_with_constraint);
-        let mut gen_unconstrained = TextGeneration::from_config(&config_without_constraint);
+        let mut gen_constrained = TextGeneration::new(&config_with_constraint);
+        let mut gen_unconstrained = TextGeneration::new(&config_without_constraint);
 
         // Set EOS token
         gen_constrained.set_eos_token_id(102); // BERT's [SEP] token
@@ -60,9 +60,9 @@ mod constrained_generation_tests {
     fn test_constraint_configuration() {
         // Test that we can create a TextGeneration with constraints
         let config = GenerationConfig::default();
-        let _text_gen = TextGeneration::from_config(&config);
+        let _text_gen = TextGeneration::new(&config);
 
-        // Test that we can create a TextGeneration from config
+        // Test that we can create a TextGeneration with config
         // Constraints are private implementation details
     }
 
@@ -78,7 +78,10 @@ mod constrained_generation_tests {
         let mut logits = Tensor::from_vec(logits_vec.clone(), vocab_size, &device).unwrap();
 
         // Create text generation with some tokens
-        let mut text_gen = TextGeneration::new(42, Some(1.0), None, None, 1.0, 64);
+        let mut config = GenerationConfig::default();
+        config.seed = 42;
+        config.temperature = 1.0;
+        let mut text_gen = TextGeneration::new(&config);
         text_gen.push_token(0); // Token that had logit 1.0
         text_gen.push_token(2); // Token that had logit 2.0
         text_gen.push_token(5); // Token that had logit 3.0
@@ -100,7 +103,10 @@ mod constrained_generation_tests {
 
     #[test]
     fn test_stop_conditions() {
-        let mut text_gen = TextGeneration::new(42, Some(1.0), None, None, 1.0, 64);
+        let mut config = GenerationConfig::default();
+        config.seed = 42;
+        config.temperature = 1.0;
+        let mut text_gen = TextGeneration::new(&config);
         text_gen.set_eos_token_id(50256); // Common EOS token
 
         // Test max length stop
@@ -120,4 +126,191 @@ mod constrained_generation_tests {
         assert!(text_gen.check_stop_sequences("The END", &stop_seqs), "Should detect stop sequence");
         assert!(!text_gen.check_stop_sequences("Continue", &stop_seqs), "Should not detect stop sequence");
     }
+
+    #[test]
+    fn test_sample_next_token_uses_repetition_penalty() {
+        use candle_core::{Tensor, Device};
+
+        let device = Device::Cpu;
+        let vocab_size = 10;
+
+        // Create initial logits
+        let logits_vec: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
+        let logits = Tensor::from_vec(logits_vec.clone(), vocab_size, &device).unwrap();
+
+        // Test 1: Create TextGeneration with repetition penalty and add some tokens to history
+        let mut config_with_penalty = GenerationConfig::default();
+        config_with_penalty.seed = 42;
+        config_with_penalty.temperature = 0.1;
+        config_with_penalty.repetition_penalty = 1.5;
+        config_with_penalty.repetition_penalty_last_n = 10;
+        let mut text_gen = TextGeneration::new(&config_with_penalty);
+        text_gen.push_token(2); // Token with logit 3.0
+        text_gen.push_token(5); // Token with logit 6.0
+        text_gen.push_token(9); // Token with logit 10.0
+
+        // Sample with repetition penalty (now uses stored penalty)
+        let _token_with_penalty = text_gen.sample_next_token(&logits).unwrap();
+
+        // Test 2: Same setup but without penalty
+        let mut config_no_penalty = GenerationConfig::default();
+        config_no_penalty.seed = 42;
+        config_no_penalty.temperature = 0.1;
+        config_no_penalty.repetition_penalty = 1.0; // No penalty
+        let mut text_gen_no_penalty = TextGeneration::new(&config_no_penalty);
+        text_gen_no_penalty.push_token(2);
+        text_gen_no_penalty.push_token(5);
+        text_gen_no_penalty.push_token(9);
+
+        let _token_without_penalty = text_gen_no_penalty.sample_next_token(&logits).unwrap();
+
+        // With low temperature and penalty, should avoid previously used high-logit tokens
+        // Without penalty, should prefer high-logit tokens
+        // This is probabilistic, but with temp=0.1 it should be fairly deterministic
+
+        // Test 3: Verify penalty is applied correctly by checking modified logits
+        let mut config_verify = GenerationConfig::default();
+        config_verify.seed = 42;
+        config_verify.temperature = 0.1;
+        config_verify.repetition_penalty = 2.0;
+        config_verify.repetition_penalty_last_n = 10;
+        let mut text_gen_verify = TextGeneration::new(&config_verify);
+        text_gen_verify.push_token(9); // Highest logit token
+
+        // Clone logits to check modification
+        let mut logits_for_penalty = logits.clone();
+        text_gen_verify.apply_repetition_penalty(&mut logits_for_penalty, 2.0, 10).unwrap();
+
+        let penalized = logits_for_penalty.to_vec1::<f32>().unwrap();
+        assert!(penalized[9] < logits_vec[9], "Token 9 should be penalized");
+        assert_eq!(penalized[0], logits_vec[0], "Token 0 should not be penalized");
+    }
+
+    #[test]
+    fn test_text_generation_from_config_parameters() {
+
+        // Create a config with specific values
+        let mut config = GenerationConfig::default();
+        config.seed = 12345;
+        config.temperature = 0.5;
+        config.top_p = Some(0.9);
+        config.top_k = Some(40); // Currently unused but should be accepted
+        config.repetition_penalty = 1.2;
+        config.repetition_penalty_last_n = 50;
+
+        // Create TextGeneration from config
+        let text_gen = TextGeneration::new(&config);
+
+        // We can't directly inspect private fields, but we can test behavior
+        // Test that it creates successfully (no panic)
+        assert!(text_gen.get_tokens().is_empty(), "Should start with no tokens");
+
+        // Test with constraint
+        let config_with_constraint = GenerationConfig::default();
+        // In real usage, this would be a real constraint
+        // For testing, we just verify it accepts the config
+        let text_gen_constrained = TextGeneration::new(&config_with_constraint);
+        assert!(text_gen_constrained.get_tokens().is_empty(), "Should start with no tokens");
+    }
+
+    #[test]
+    fn test_generation_with_different_penalties() {
+        use candle_core::{Tensor, Device, DType};
+
+        let device = Device::Cpu;
+        let vocab_size = 50;
+
+        // Create logits with clear preferences
+        let mut logits_vec = vec![0.0; vocab_size];
+        logits_vec[10] = 10.0; // Strong preference
+        logits_vec[20] = 8.0; // Second preference
+        logits_vec[30] = 6.0; // Third preference
+
+        // Test different penalty configurations
+        let configs = vec![
+            (1.0, 64), // No penalty (1.0 = neutral)
+            (1.5, 64), // Moderate penalty
+            (2.0, 64), // Strong penalty
+            (1.2, 10), // Penalty with limited range
+        ];
+
+        for (penalty, last_n) in configs {
+            let mut config = GenerationConfig::default();
+            config.seed = 42; // Fixed seed for reproducibility
+            config.temperature = 0.1; // Low temperature for more deterministic behavior
+            config.repetition_penalty = penalty;
+            config.repetition_penalty_last_n = last_n;
+
+            let mut text_gen = TextGeneration::new(&config);
+
+            // Generate a sequence of tokens
+            let mut generated = Vec::new();
+            for _i in 0..5 {
+                let logits = Tensor::from_vec(logits_vec.clone(), vocab_size, &device).unwrap().to_dtype(DType::F32).unwrap();
+
+                let token = text_gen.sample_next_token(&logits).unwrap();
+
+                generated.push(token);
+
+                // Verify the token is in valid range
+                assert!(token < vocab_size as u32, "Token should be within vocabulary");
+            }
+
+            // With higher penalties, we should see more diversity (less repetition)
+            let unique_tokens = generated.iter().collect::<std::collections::HashSet<_>>().len();
+            if penalty > 1.5 {
+                assert!(unique_tokens >= 3, "High penalty should produce diverse tokens");
+            }
+        }
+    }
+
+    #[test]
+    fn test_sample_next_token_integration() {
+        use candle_core::{Tensor, Device, DType};
+
+        let device = Device::Cpu;
+
+        // Test the full integration of sample_next_token
+        let mut config = GenerationConfig::default();
+        config.seed = 999;
+        config.temperature = 0.7;
+        config.max_length = 10;
+        config.repetition_penalty = 1.3;
+        config.repetition_penalty_last_n = 5;
+
+        let mut text_gen = TextGeneration::new(&config);
+        text_gen.set_eos_token_id(50256);
+
+        // Simulate a generation loop
+        let vocab_size = 100;
+        let mut all_tokens = Vec::new();
+
+        for step in 0..8 {
+            // Create varying logits to simulate model output
+            let mut logits_vec = vec![0.0; vocab_size];
+            // Make different tokens attractive at different steps
+            let preferred_token = (step * 13) % vocab_size;
+            logits_vec[preferred_token] = 5.0;
+            logits_vec[(preferred_token + 10) % vocab_size] = 4.0;
+            logits_vec[(preferred_token + 20) % vocab_size] = 3.0;
+
+            let logits = Tensor::from_vec(logits_vec, vocab_size, &device).unwrap().to_dtype(DType::F32).unwrap();
+
+            let token = text_gen.sample_next_token(&logits).unwrap();
+
+            all_tokens.push(token);
+
+            // Check if we should stop
+            if text_gen.should_stop(token, config.max_length) {
+                break;
+            }
+        }
+
+        // Verify generation worked
+        assert!(!all_tokens.is_empty(), "Should generate some tokens");
+        assert!(all_tokens.len() <= config.max_length, "Should respect max length");
+
+        // Verify tokens are being tracked
+        assert_eq!(text_gen.get_tokens().len(), all_tokens.len(), "Internal tokens should match generated");
+    }
 }
@@ -16,6 +16,10 @@ pub struct Gemma {
 }
 
 impl Gemma {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Clear the KV cache between generations
     pub fn clear_kv_cache(&mut self) {
         self.model.clear_kv_cache();
@@ -49,6 +53,9 @@ impl Gemma {
             vec![single_file]
         } else {
             // Try to find sharded model files
+            // NOTE: This uses a brute-force approach, trying common shard counts.
+            // A better approach would be to read model.safetensors.index.json which
+            // contains the exact file list, but this works for most models (≤12 shards).
             let mut sharded_files = Vec::new();
             let mut index = 1;
             loop {
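The NOTE above (repeated in the Llama, Mistral, and Phi loaders below) refers to the standard Hugging Face sharded-safetensors layout, where `model.safetensors.index.json` maps each tensor name to the shard file that contains it. A sketch of the index-based lookup the comment suggests, assuming `serde_json` is available; this is illustrative, not the gem's implementation:

```rust
use std::collections::BTreeSet;

// Resolve shard filenames from model.safetensors.index.json instead of
// probing shard counts. `index_json` is the raw contents of the index file.
fn shard_files(index_json: &str) -> Result<Vec<String>, serde_json::Error> {
    let index: serde_json::Value = serde_json::from_str(index_json)?;
    let mut files = BTreeSet::new(); // de-duplicates and sorts shard names
    if let Some(weight_map) = index["weight_map"].as_object() {
        for file in weight_map.values().filter_map(|v| v.as_str()) {
            files.insert(file.to_string());
        }
    }
    Ok(files.into_iter().collect())
}
```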
@@ -139,7 +146,7 @@ impl Gemma {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -165,10 +172,7 @@ impl Gemma {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -190,6 +194,18 @@ impl Gemma {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
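This early-stopping block is repeated verbatim in each model's generation loop in this release (Gemma, Llama, Mistral, Phi, QuantizedGGUF, and Qwen below). Distilled into a standalone predicate for readability (a sketch; the crate inlines the check rather than factoring it out like this):

```rust
// Sketch of the repeated check: stop once the structured-generation
// constraint is satisfied, honoring both new config flags.
fn constraint_says_stop(config: &GenerationConfig, text_gen: &TextGeneration) -> bool {
    config.stop_on_constraint_satisfaction
        && if config.stop_on_match {
            text_gen.is_constraint_satisfied_stop_on_match()
        } else {
            text_gen.is_constraint_satisfied()
        }
}
```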
@@ -27,6 +27,10 @@ pub struct GenerationConfig {
     pub debug_tokens: bool,
     /// Optional constraint index for structured generation
     pub constraint: Option<Arc<Index>>,
+    /// Stop immediately when constraint is satisfied
+    pub stop_on_constraint_satisfaction: bool,
+    /// Whether to stop immediately when pattern is matched (vs allowing continuation)
+    pub stop_on_match: bool,
 }
 
 /// Generate a random seed based on current time
@@ -51,6 +55,8 @@ impl Default for GenerationConfig {
             include_prompt: false,
             debug_tokens: false,
             constraint: None,
+            stop_on_constraint_satisfaction: true,
+            stop_on_match: true,
         }
     }
 }
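Both flags default to `true`, so constrained generation now stops as soon as the constraint (e.g., a JSON schema or regular expression) is fully matched. A caller who wants the pre-1.1.1 behavior of generating until another stop condition fires can override the defaults; a minimal sketch, where `index` stands in for a constraint `Arc<Index>` built elsewhere:

```rust
use std::sync::Arc;

// Configure structured generation against a pre-built constraint index,
// opting out of the new early-stopping defaults.
fn configure(index: Arc<Index>) -> GenerationConfig {
    let mut config = GenerationConfig::default();
    config.constraint = Some(index);
    config.stop_on_constraint_satisfaction = false; // keep generating past a full match
    config.stop_on_match = false;
    config
}
```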
@@ -18,6 +18,10 @@ pub struct Llama {
 }
 
 impl Llama {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Clear the KV cache between generations
     pub fn clear_kv_cache(&mut self) {
         // Since Cache doesn't expose a reset method and kvs is private,
@@ -58,6 +62,9 @@ impl Llama {
             vec![consolidated_file]
         } else {
             // Try to find sharded model files
+            // NOTE: This uses a brute-force approach, trying common shard counts.
+            // A better approach would be to read model.safetensors.index.json which
+            // contains the exact file list, but this works for most models (≤30 shards).
             let mut sharded_files = Vec::new();
             let mut index = 1;
             loop {
@@ -175,7 +182,7 @@ impl Llama {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -201,10 +208,7 @@ impl Llama {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -226,6 +230,18 @@ impl Llama {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
@@ -16,6 +16,10 @@ pub struct Mistral {
 }
 
 impl Mistral {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Clear the KV cache between generations
     pub fn clear_kv_cache(&mut self) {
         self.model.clear_kv_cache();
@@ -52,6 +56,9 @@ impl Mistral {
             vec![consolidated_file]
         } else {
             // Try to find sharded model files
+            // NOTE: This uses a brute-force approach, trying common shard counts.
+            // A better approach would be to read model.safetensors.index.json which
+            // contains the exact file list, but this works for most models (≤8 shards).
             let mut sharded_files = Vec::new();
             let mut index = 1;
             loop {
@@ -144,7 +151,7 @@ impl Mistral {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -176,10 +183,7 @@ impl Mistral {
             // Convert to F32 for sampling if needed
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -201,6 +205,18 @@ impl Mistral {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
@@ -21,6 +21,10 @@ enum PhiVariant {
 }
 
 impl Phi {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Get the tokenizer
     pub fn tokenizer(&self) -> &TokenizerWrapper {
         &self.tokenizer
@@ -68,6 +72,9 @@ impl Phi {
             vec![single_file]
         } else {
             // Try to find sharded model files
+            // NOTE: This uses a brute-force approach, trying common shard counts.
+            // A better approach would be to read model.safetensors.index.json which
+            // contains the exact file list, but this works for most models (≤30 shards).
             let mut sharded_files = Vec::new();
             let mut index = 1;
             loop {
@@ -177,7 +184,7 @@ impl Phi {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -206,10 +213,7 @@ impl Phi {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -229,6 +233,18 @@ impl Phi {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
@@ -32,6 +32,10 @@ enum ModelType {
 }
 
 impl QuantizedGGUF {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Get the tokenizer
     pub fn tokenizer(&self) -> &TokenizerWrapper {
         &self.tokenizer
@@ -538,7 +542,7 @@ impl QuantizedGGUF {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -571,10 +575,7 @@ impl QuantizedGGUF {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -596,6 +597,18 @@ impl QuantizedGGUF {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
@@ -16,6 +16,10 @@ pub struct Qwen {
 }
 
 impl Qwen {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Get the tokenizer
     pub fn tokenizer(&self) -> &TokenizerWrapper {
         &self.tokenizer
@@ -55,6 +59,9 @@ impl Qwen {
             .unwrap_or(151643); // Default Qwen3 EOS token
 
         // Download model weights
+        // NOTE: Qwen uses hardcoded shard counts based on model size rather than
+        // reading model.safetensors.index.json. This works for official Qwen models
+        // but may fail for custom configurations with different shard counts.
         let mut filenames = vec![];
         let num_shards = if model_id.contains("72b") || model_id.contains("72B") { 8 }
         else if model_id.contains("14b") || model_id.contains("14B") { 3 }
@@ -124,7 +131,7 @@ impl Qwen {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -150,10 +157,7 @@ impl Qwen {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -173,6 +177,18 @@ impl Qwen {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {