red-candle 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +460 -379
- data/README.md +1 -1
- data/ext/candle/Cargo.toml +3 -3
- data/ext/candle/src/llm/constrained_generation_test.rs +79 -0
- data/ext/candle/src/llm/gemma.rs +24 -9
- data/ext/candle/src/llm/llama.rs +46 -10
- data/ext/candle/src/llm/mistral.rs +46 -10
- data/ext/candle/src/llm/phi.rs +76 -8
- data/ext/candle/src/llm/qwen.rs +23 -10
- data/ext/candle/src/llm/text_generation.rs +40 -50
- data/ext/candle/src/ruby/llm.rs +62 -29
- data/ext/candle/src/ruby/structured.rs +54 -10
- data/lib/candle/llm.rb +77 -3
- data/lib/candle/version.rb +1 -1
- metadata +11 -13
- data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt +0 -1
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs +0 -355
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs +0 -276
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs +0 -49
- data/ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs +0 -2748
- data/ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs +0 -8902
data/README.md
CHANGED

@@ -1,4 +1,4 @@
-<img src="/docs/assets/logo-title.png" alt="red-candle" height="
+<img src="/docs/assets/logo-title.png" alt="red-candle" height="160px">
 
 [](https://github.com/scientist-labs/red-candle/actions/workflows/build.yml)
 [](https://badge.fury.io/rb/red-candle)
data/ext/candle/Cargo.toml
CHANGED

@@ -12,8 +12,8 @@ crate-type = ["cdylib"]
 candle-core = { version = "0.9.1" }
 candle-nn = { version = "0.9.1" }
 candle-transformers = { version = "0.9.1" }
-tokenizers = { version = "0.
-hf-hub = "0.4.
+tokenizers = { version = "0.22.0", default-features = true, features = ["fancy-regex"] }
+hf-hub = "0.4.1"
 half = "2.6.0"
 magnus = "0.7.1"
 safetensors = "0.3"
@@ -21,7 +21,7 @@ serde_json = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.45", features = ["rt", "macros"] }
 rand = "0.8"
-outlines-core = "0.2"
+outlines-core = "0.2.11"
 
 [features]
 default = []
data/ext/candle/src/llm/constrained_generation_test.rs
CHANGED

@@ -313,4 +313,83 @@ mod constrained_generation_tests {
         // Verify tokens are being tracked
         assert_eq!(text_gen.get_tokens().len(), all_tokens.len(), "Internal tokens should match generated");
     }
+
+    #[test]
+    fn test_constraint_satisfied_not_triggered_by_large_allowed_set() {
+        // This test verifies the fix for the bug where is_constraint_satisfied_stop_on_match
+        // would incorrectly return true when many tokens are allowed (e.g., inside a JSON string).
+        // The old buggy code had: if allowed.len() > 1000 { return true; }
+        // This caused early termination when inside strings with many valid characters.
+
+        let config = GenerationConfig::default();
+        let mut text_gen = TextGeneration::new(&config);
+        text_gen.set_eos_token_id(50256);
+
+        // Without a constraint, should not be satisfied
+        assert!(!text_gen.is_constraint_satisfied(),
+            "Without constraint, should not be satisfied");
+        assert!(!text_gen.is_constraint_satisfied_stop_on_match(),
+            "Without constraint, stop_on_match should not be satisfied");
+    }
+
+    #[test]
+    fn test_constraint_satisfied_only_when_empty_or_eos_only() {
+        // Test that constraint satisfaction only triggers when:
+        // 1. No tokens are allowed (empty set)
+        // 2. Only EOS token is allowed
+        // NOT when many tokens are allowed (like inside a JSON string)
+
+        let config = GenerationConfig::default();
+        let mut text_gen = TextGeneration::new(&config);
+        text_gen.set_eos_token_id(100); // Set EOS token
+
+        // Without constraint, should not be satisfied
+        assert!(!text_gen.is_constraint_satisfied());
+        assert!(!text_gen.is_constraint_satisfied_stop_on_match());
+
+        // The key insight: constraint satisfaction should NOT be triggered
+        // just because there are many allowed tokens. It should only trigger
+        // when the constraint is definitively complete (empty allowed set or only EOS).
+    }
+
+    #[tokio::test]
+    async fn test_constraint_with_json_schema_not_early_termination() {
+        // Integration test: Create a real JSON schema constraint and verify
+        // that being inside a string (many allowed tokens) doesn't trigger completion.
+
+        if let Ok(tokenizer) = TokenizerLoader::from_hf_hub("bert-base-uncased", None).await {
+            let wrapper = TokenizerWrapper::new(tokenizer);
+            let vocabulary = VocabularyAdapter::from_tokenizer(&wrapper)
+                .expect("Should create vocabulary");
+
+            let processor = SchemaProcessor::new();
+
+            // Schema with a string field - when generating content inside the string,
+            // many characters are valid, but the constraint is NOT complete
+            let schema = r#"{
+                "type": "object",
+                "properties": {
+                    "name": { "type": "string" }
+                },
+                "required": ["name"]
+            }"#;
+
+            let index = processor.process_schema(schema, &vocabulary)
+                .expect("Should process schema");
+
+            let mut config = GenerationConfig::default();
+            config.constraint = Some(index);
+            config.max_length = 100;
+
+            let mut text_gen = TextGeneration::new(&config);
+            text_gen.set_eos_token_id(102); // BERT's [SEP]
+
+            // At the initial state, the constraint should NOT be satisfied
+            // (we haven't generated a complete JSON object yet)
+            assert!(!text_gen.is_constraint_satisfied(),
+                "Initial state should not be satisfied - JSON not yet generated");
+            assert!(!text_gen.is_constraint_satisfied_stop_on_match(),
+                "Initial state should not trigger stop_on_match");
+        }
+    }
 }
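These new tests pin down the corrected completion rule for constrained generation: a constraint counts as satisfied only when the allowed-token set is empty, or when the only token still allowed is EOS; it must never be reported as satisfied merely because many tokens are currently allowed (as happens inside a JSON string). A minimal sketch of that rule, using illustrative names (constraint_complete, allowed_tokens, eos_token_id) rather than the crate's actual API:

    // Sketch of the completion rule described by the tests above (names are illustrative).
    fn constraint_complete(allowed_tokens: &[u32], eos_token_id: Option<u32>) -> bool {
        match (allowed_tokens, eos_token_id) {
            ([], _) => true,                             // nothing can be generated: done
            ([only], Some(eos)) if *only == eos => true, // only EOS remains: done
            _ => false,                                  // many tokens allowed (e.g. inside a string): keep going
        }
    }

    fn main() {
        assert!(constraint_complete(&[], None));
        assert!(constraint_complete(&[102], Some(102)));
        assert!(!constraint_complete(&[10, 11, 12], Some(102)));
    }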
data/ext/candle/src/llm/gemma.rs
CHANGED

@@ -30,8 +30,8 @@ impl Gemma {
         &self.tokenizer
     }
 
-    /// Load a Gemma model from HuggingFace Hub
-    pub async fn
+    /// Load a Gemma model from HuggingFace Hub with optional custom tokenizer
+    pub async fn from_pretrained_with_tokenizer(model_id: &str, device: Device, tokenizer_source: Option<&str>) -> CandleResult<Self> {
         let api = Api::new()
             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
 
@@ -43,10 +43,23 @@ impl Gemma {
             .await
             .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
 
-
-
-            .
-
+        // Download tokenizer from custom source if provided, otherwise from model repo
+        let tokenizer = if let Some(tokenizer_id) = tokenizer_source {
+            let tokenizer_repo = api.repo(Repo::model(tokenizer_id.to_string()));
+            let tokenizer_filename = tokenizer_repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer from {}: {}", tokenizer_id, e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        } else {
+            let tokenizer_filename = repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        };
 
         // Try different file patterns for model weights
         let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
@@ -87,9 +100,6 @@ impl Gemma {
         let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)
             .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config: {}", e)))?;
 
-        // Load tokenizer
-        let tokenizer = Tokenizer::from_file(tokenizer_filename)
-            .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
 
         // Gemma uses specific tokens
         let eos_token_id = {
@@ -116,6 +126,11 @@ impl Gemma {
         })
     }
 
+    /// Load a Gemma model from HuggingFace Hub (backwards compatibility)
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        Self::from_pretrained_with_tokenizer(model_id, device, None).await
+    }
+
     /// Create from existing components (useful for testing)
     pub fn new(
         model: GemmaModel,
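The loaders below (Llama, Mistral, Phi, Qwen) gain the same pair of entry points as Gemma above: a new from_pretrained_with_tokenizer that can fetch tokenizer.json from a separate repository, plus a from_pretrained wrapper that keeps the old call signature. A rough usage sketch, assuming the crate's Gemma/Llama types and the CandleResult alias shown in this diff; the repository IDs are placeholders, not real model references:

    use candle_core::Device;

    // Sketch only: Gemma, Llama, and CandleResult are the crate items shown in this diff.
    async fn load_models() -> CandleResult<()> {
        // Unchanged path: tokenizer.json is taken from the model repository itself.
        let _gemma = Gemma::from_pretrained("example-org/gemma-model", Device::Cpu).await?;

        // New in 1.3: fetch the tokenizer from a different repository, useful when the
        // model repo ships no tokenizer.json (both IDs here are placeholders).
        let _llama = Llama::from_pretrained_with_tokenizer(
            "example-org/llama-finetune",
            Device::Cpu,
            Some("example-org/llama-tokenizer"),
        )
        .await?;

        Ok(())
    }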
data/ext/candle/src/llm/llama.rs
CHANGED

@@ -37,8 +37,8 @@ impl Llama {
         &self.tokenizer
     }
 
-    /// Load a Llama model from HuggingFace Hub
-    pub async fn
+    /// Load a Llama model from HuggingFace Hub with optional custom tokenizer
+    pub async fn from_pretrained_with_tokenizer(model_id: &str, device: Device, tokenizer_source: Option<&str>) -> CandleResult<Self> {
         let api = Api::new()
             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
 
@@ -50,10 +50,45 @@ impl Llama {
             .await
             .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
 
-
-
-            .
-
+        // Download tokenizer from custom source if provided, otherwise from model repo
+        let tokenizer = if let Some(tokenizer_id) = tokenizer_source {
+            let tokenizer_repo = api.repo(Repo::model(tokenizer_id.to_string()));
+            let tokenizer_filename = tokenizer_repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| {
+                    let error_msg = if e.to_string().contains("404") || e.to_string().contains("Not Found") {
+                        format!("Tokenizer file 'tokenizer.json' not found in repository '{}'. The repository may not have a tokenizer.json file or may use a different format (e.g., tokenizer.model for SentencePiece).", tokenizer_id)
+                    } else if e.to_string().contains("401") || e.to_string().contains("Unauthorized") {
+                        format!("Authentication required to access tokenizer '{}'. You may need to set HF_TOKEN environment variable with a valid Hugging Face token.", tokenizer_id)
+                    } else if e.to_string().contains("timed out") || e.to_string().contains("connection") {
+                        format!("Network error downloading tokenizer from '{}': {}. Please check your internet connection.", tokenizer_id, e)
+                    } else {
+                        format!("Failed to download tokenizer from '{}': {}", tokenizer_id, e)
+                    };
+                    candle_core::Error::Msg(error_msg)
+                })?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer file: {}", e)))?
+        } else {
+            let tokenizer_filename = repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| {
+                    let error_msg = if e.to_string().contains("404") || e.to_string().contains("Not Found") {
+                        format!("No tokenizer found in model repository '{}'. The model may not include a tokenizer. Try specifying a tokenizer explicitly using the 'tokenizer' parameter, e.g.: from_pretrained('{}', tokenizer: 'appropriate-tokenizer-repo')", model_id, model_id)
+                    } else if e.to_string().contains("401") || e.to_string().contains("Unauthorized") {
+                        format!("Authentication required to access model '{}'. You may need to set HF_TOKEN environment variable with a valid Hugging Face token.", model_id)
+                    } else if e.to_string().contains("timed out") || e.to_string().contains("connection") {
+                        format!("Network error downloading tokenizer: {}. Please check your internet connection.", e)
+                    } else {
+                        format!("Failed to download tokenizer: {}", e)
+                    };
+                    candle_core::Error::Msg(error_msg)
+                })?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer file: {}", e)))?
+        };
 
         // Try different file patterns for model weights
         let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
@@ -97,10 +132,6 @@ impl Llama {
             .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config: {}", e)))?;
         let config = llama_config.into_config(false); // Don't use flash attention for now
 
-        // Load tokenizer
-        let tokenizer = Tokenizer::from_file(tokenizer_filename)
-            .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
-
         // Determine EOS token ID based on model type
         let eos_token_id = if model_id.contains("Llama-3") || model_id.contains("llama-3") {
             // Llama 3 uses different special tokens
@@ -139,6 +170,11 @@ impl Llama {
         })
     }
 
+    /// Load a Llama model from HuggingFace Hub (backwards compatibility)
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        Self::from_pretrained_with_tokenizer(model_id, device, None).await
+    }
+
     /// Create from existing components (useful for testing)
     pub fn new(
         model: LlamaModel,
data/ext/candle/src/llm/mistral.rs
CHANGED

@@ -30,8 +30,8 @@ impl Mistral {
         &self.tokenizer
     }
 
-    /// Load a Mistral model from HuggingFace Hub
-    pub async fn
+    /// Load a Mistral model from HuggingFace Hub with optional custom tokenizer
+    pub async fn from_pretrained_with_tokenizer(model_id: &str, device: Device, tokenizer_source: Option<&str>) -> CandleResult<Self> {
         let api = Api::new()
             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
 
@@ -43,10 +43,45 @@ impl Mistral {
             .await
             .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
 
-
-
-            .
-
+        // Download tokenizer from custom source if provided, otherwise from model repo
+        let tokenizer = if let Some(tokenizer_id) = tokenizer_source {
+            let tokenizer_repo = api.repo(Repo::model(tokenizer_id.to_string()));
+            let tokenizer_filename = tokenizer_repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| {
+                    let error_msg = if e.to_string().contains("404") || e.to_string().contains("Not Found") {
+                        format!("Tokenizer file 'tokenizer.json' not found in repository '{}'. The repository may not have a tokenizer.json file or may use a different format (e.g., tokenizer.model for SentencePiece).", tokenizer_id)
+                    } else if e.to_string().contains("401") || e.to_string().contains("Unauthorized") {
+                        format!("Authentication required to access tokenizer '{}'. You may need to set HF_TOKEN environment variable with a valid Hugging Face token.", tokenizer_id)
+                    } else if e.to_string().contains("timed out") || e.to_string().contains("connection") {
+                        format!("Network error downloading tokenizer from '{}': {}. Please check your internet connection.", tokenizer_id, e)
+                    } else {
+                        format!("Failed to download tokenizer from '{}': {}", tokenizer_id, e)
+                    };
+                    candle_core::Error::Msg(error_msg)
+                })?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer file: {}", e)))?
+        } else {
+            let tokenizer_filename = repo
+                .get("tokenizer.json")
+                .await
+                .map_err(|e| {
+                    let error_msg = if e.to_string().contains("404") || e.to_string().contains("Not Found") {
+                        format!("No tokenizer found in model repository '{}'. The model may not include a tokenizer. Try specifying a tokenizer explicitly using the 'tokenizer' parameter, e.g.: from_pretrained('{}', tokenizer: 'mistralai/Mistral-7B-Instruct-v0.2')", model_id, model_id)
+                    } else if e.to_string().contains("401") || e.to_string().contains("Unauthorized") {
+                        format!("Authentication required to access model '{}'. You may need to set HF_TOKEN environment variable with a valid Hugging Face token.", model_id)
+                    } else if e.to_string().contains("timed out") || e.to_string().contains("connection") {
+                        format!("Network error downloading tokenizer: {}. Please check your internet connection.", e)
+                    } else {
+                        format!("Failed to download tokenizer: {}", e)
+                    };
+                    candle_core::Error::Msg(error_msg)
+                })?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer file: {}", e)))?
+        };
 
         // Try different file patterns for model weights
         let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
@@ -97,10 +132,6 @@ impl Mistral {
         let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)
             .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config: {}", e)))?;
 
-        // Load tokenizer
-        let tokenizer = Tokenizer::from_file(tokenizer_filename)
-            .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
-
         let eos_token_id = tokenizer
             .get_vocab(true)
             .get("</s>")
@@ -123,6 +154,11 @@ impl Mistral {
         })
     }
 
+    /// Load a Mistral model from HuggingFace Hub (backwards compatibility)
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        Self::from_pretrained_with_tokenizer(model_id, device, None).await
+    }
+
     /// Create from existing components (useful for testing)
     pub fn new(
         model: MistralModel,
data/ext/candle/src/llm/phi.rs
CHANGED

@@ -38,8 +38,8 @@ impl Phi {
         }
     }
 
-    /// Load a Phi model from HuggingFace
-    pub async fn
+    /// Load a Phi model from HuggingFace with optional custom tokenizer
+    pub async fn from_pretrained_with_tokenizer(model_id: &str, device: Device, tokenizer_source: Option<&str>) -> CandleResult<Self> {
         let api = Api::new()
             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
 
@@ -50,11 +50,19 @@ impl Phi {
             .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
         let config_str = std::fs::read_to_string(config_filename)?;
 
-        // Download tokenizer
-        let
-            .
-
-
+        // Download tokenizer from custom source if provided, otherwise from model repo
+        let tokenizer = if let Some(tokenizer_id) = tokenizer_source {
+            let tokenizer_repo = api.model(tokenizer_id.to_string());
+            let tokenizer_filename = tokenizer_repo.get("tokenizer.json").await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer from {}: {}", tokenizer_id, e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        } else {
+            let tokenizer_filename = repo.get("tokenizer.json").await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        };
 
         // Determine EOS token
         let vocab = tokenizer.get_vocab(true);
@@ -104,7 +112,62 @@ impl Phi {
 
         let model = if is_phi3 {
             // Load Phi3 model
-
+            // Handle config differences between Phi-3-small and Phi-3-mini
+            let mut config_str_fixed;
+
+            // Parse config as JSON for modifications
+            let mut config_json: serde_json::Value = serde_json::from_str(&config_str)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config JSON: {}", e)))?;
+
+            // Phi-3-small uses ff_intermediate_size instead of intermediate_size
+            if config_json.get("ff_intermediate_size").is_some() && config_json.get("intermediate_size").is_none() {
+                if let Some(ff_size) = config_json.get("ff_intermediate_size").cloned() {
+                    config_json["intermediate_size"] = ff_size;
+                }
+            }
+
+            // Phi-3-small uses layer_norm_epsilon instead of rms_norm_eps
+            if config_json.get("layer_norm_epsilon").is_some() && config_json.get("rms_norm_eps").is_none() {
+                if let Some(eps) = config_json.get("layer_norm_epsilon").cloned() {
+                    config_json["rms_norm_eps"] = eps;
+                }
+            }
+
+            // Handle rope_scaling for long context models (Phi-3-mini-128k)
+            // Candle expects rope_scaling to be a string, but newer configs have it as an object
+            if let Some(rope_scaling) = config_json.get("rope_scaling") {
+                if rope_scaling.is_object() {
+                    // For now, just convert to the type string - candle will use default scaling
+                    if let Some(scaling_type) = rope_scaling.get("type").and_then(|v| v.as_str()) {
+                        config_json["rope_scaling"] = serde_json::Value::String(scaling_type.to_string());
+                    } else {
+                        // Remove it if we can't determine the type
+                        config_json.as_object_mut().unwrap().remove("rope_scaling");
+                    }
+                }
+            }
+
+            // Phi-3-small uses rope_embedding_base instead of rope_theta
+            if config_json.get("rope_embedding_base").is_some() && config_json.get("rope_theta").is_none() {
+                if let Some(rope_base) = config_json.get("rope_embedding_base").cloned() {
+                    config_json["rope_theta"] = rope_base;
+                }
+            }
+
+            config_str_fixed = serde_json::to_string(&config_json)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to serialize config: {}", e)))?;
+
+            // Check for unsupported gegelu activation
+            if config_str_fixed.contains("\"gegelu\"") {
+                // For now, map gegelu to gelu_pytorch_tanh with a warning
+                // This is not ideal but allows the model to at least load
+                eprintln!("WARNING: This model uses 'gegelu' activation which is not fully supported.");
+                eprintln!("         Mapping to 'gelu_pytorch_tanh' - results may be degraded.");
+                eprintln!("         For best results, use Phi-3-mini models instead.");
+                config_str_fixed = config_str_fixed.replace("\"gegelu\"", "\"gelu_pytorch_tanh\"");
+            }
+
+            let config: Phi3Config = serde_json::from_str(&config_str_fixed)
                 .map_err(|e| candle_core::Error::Msg(format!("Failed to parse Phi3 config: {}", e)))?;
 
             let vb = unsafe {
@@ -134,6 +197,11 @@ impl Phi {
             eos_token_id,
         })
     }
+
+    /// Load a Phi model from HuggingFace (backwards compatibility)
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        Self::from_pretrained_with_tokenizer(model_id, device, None).await
+    }
 
     /// Apply Phi chat template to messages
     pub fn apply_chat_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
data/ext/candle/src/llm/qwen.rs
CHANGED

@@ -30,8 +30,8 @@ impl Qwen {
         self.model.clear_kv_cache();
     }
 
-    /// Load a Qwen model from HuggingFace
-    pub async fn
+    /// Load a Qwen model from HuggingFace with optional custom tokenizer
+    pub async fn from_pretrained_with_tokenizer(model_id: &str, device: Device, tokenizer_source: Option<&str>) -> CandleResult<Self> {
         let api = Api::new()
             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
 
@@ -44,19 +44,27 @@ impl Qwen {
         let config: Config = serde_json::from_str(&config_str)
            .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config: {}", e)))?;
 
-        // Download tokenizer
-        let
-            .
-
-
+        // Download tokenizer from custom source if provided, otherwise from model repo
+        let tokenizer = if let Some(tokenizer_id) = tokenizer_source {
+            let tokenizer_repo = api.model(tokenizer_id.to_string());
+            let tokenizer_filename = tokenizer_repo.get("tokenizer.json").await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer from {}: {}", tokenizer_id, e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        } else {
+            let tokenizer_filename = repo.get("tokenizer.json").await
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+            Tokenizer::from_file(tokenizer_filename)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?
+        };
 
         // Determine EOS token
         let vocab = tokenizer.get_vocab(true);
-        let eos_token_id = vocab.get("<|
-            .or_else(|| vocab.get("<|
+        let eos_token_id = vocab.get("<|im_end|>")
+            .or_else(|| vocab.get("<|endoftext|>"))
             .or_else(|| vocab.get("</s>"))
             .copied()
-            .unwrap_or(
+            .unwrap_or(151645); // Default Qwen2.5 EOS token
 
         // Download model weights
         // NOTE: Qwen uses hardcoded shard counts based on model size rather than
@@ -97,6 +105,11 @@ impl Qwen {
         })
     }
 
+    /// Load a Qwen model from HuggingFace (backwards compatibility)
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        Self::from_pretrained_with_tokenizer(model_id, device, None).await
+    }
+
     /// Apply Qwen chat template to messages
     pub fn apply_chat_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
         let mut prompt = String::new();