red-candle 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +244 -6
- data/README.md +36 -2
- data/Rakefile +46 -1
- data/ext/candle/Cargo.toml +2 -0
- data/ext/candle/src/lib.rs +2 -0
- data/ext/candle/src/llm/constrained_generation_test.rs +123 -0
- data/ext/candle/src/llm/generation_config.rs +5 -0
- data/ext/candle/src/llm/mod.rs +5 -0
- data/ext/candle/src/llm/phi.rs +285 -0
- data/ext/candle/src/llm/quantized_gguf.rs +155 -4
- data/ext/candle/src/llm/qwen.rs +229 -0
- data/ext/candle/src/llm/text_generation.rs +66 -2
- data/ext/candle/src/ruby/device.rs +5 -0
- data/ext/candle/src/ruby/llm.rs +42 -4
- data/ext/candle/src/ruby/mod.rs +1 -0
- data/ext/candle/src/ruby/structured.rs +47 -0
- data/ext/candle/src/structured/integration_test.rs +130 -0
- data/ext/candle/src/structured/mod.rs +31 -0
- data/ext/candle/src/structured/schema_processor.rs +215 -0
- data/ext/candle/src/structured/vocabulary_adapter.rs +152 -0
- data/ext/candle/src/structured/vocabulary_adapter_real_test.rs +66 -0
- data/ext/candle/src/structured/vocabulary_adapter_simple_test.rs +70 -0
- data/lib/candle/llm.rb +109 -3
- data/lib/candle/version.rb +1 -1
- metadata +14 -4
data/ext/candle/src/llm/generation_config.rs
CHANGED
@@ -1,4 +1,6 @@
 use std::time::{SystemTime, UNIX_EPOCH};
+use std::sync::Arc;
+use crate::structured::Index;
 
 /// Configuration for text generation
 #[derive(Debug, Clone)]
@@ -23,6 +25,8 @@ pub struct GenerationConfig {
     pub include_prompt: bool,
     /// Whether to show raw tokens during generation (for debugging)
     pub debug_tokens: bool,
+    /// Optional constraint index for structured generation
+    pub constraint: Option<Arc<Index>>,
 }
 
 /// Generate a random seed based on current time
@@ -46,6 +50,7 @@ impl Default for GenerationConfig {
             stop_sequences: vec![],
             include_prompt: false,
             debug_tokens: false,
+            constraint: None,
         }
     }
 }
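The new `constraint` field threads a structured-generation index into the generation settings. Below is a minimal sketch of wiring it up from inside the extension crate, assuming an `Arc<Index>` has already been produced by the new `structured` module (its construction API is not shown in this diff; the helper below is illustrative only):

```rust
use std::sync::Arc;

use crate::llm::GenerationConfig;
use crate::structured::Index;

// Illustrative helper: takes an already-compiled constraint Index and
// attaches it to an otherwise default GenerationConfig. Samplers that
// honor `constraint` can then restrict sampling to schema-valid tokens.
fn constrained_config(index: Arc<Index>) -> GenerationConfig {
    GenerationConfig {
        constraint: Some(index),
        ..Default::default()
    }
}
```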
data/ext/candle/src/llm/mod.rs
CHANGED
@@ -3,6 +3,8 @@ use candle_core::{Device, Result as CandleResult};
 pub mod mistral;
 pub mod llama;
 pub mod gemma;
+pub mod qwen;
+pub mod phi;
 pub mod generation_config;
 pub mod text_generation;
 pub mod quantized_gguf;
@@ -12,6 +14,9 @@ pub use text_generation::TextGeneration;
 pub use quantized_gguf::QuantizedGGUF;
 pub use crate::tokenizer::TokenizerWrapper;
 
+#[cfg(test)]
+mod constrained_generation_test;
+
 /// Trait for text generation models
 pub trait TextGenerator: Send + Sync {
     /// Generate text from a prompt
data/ext/candle/src/llm/phi.rs
ADDED
@@ -0,0 +1,285 @@
+use candle_core::{DType, Device, Result as CandleResult, Tensor};
+use candle_transformers::models::phi::{Config, Model as PhiModel};
+use candle_transformers::models::phi3::{Config as Phi3Config, Model as Phi3Model};
+use hf_hub::api::tokio::Api;
+use tokenizers::Tokenizer;
+
+use crate::llm::{GenerationConfig, TextGeneration, TextGenerator, TokenizerWrapper};
+
+/// Phi model wrapper for text generation
+pub struct Phi {
+    model: PhiVariant,
+    tokenizer: TokenizerWrapper,
+    device: Device,
+    model_id: String,
+    eos_token_id: u32,
+}
+
+enum PhiVariant {
+    Phi2(PhiModel),
+    Phi3(Phi3Model),
+}
+
+impl Phi {
+    /// Get the tokenizer
+    pub fn tokenizer(&self) -> &TokenizerWrapper {
+        &self.tokenizer
+    }
+
+    /// Clear the KV cache between generations
+    pub fn clear_kv_cache(&mut self) {
+        match &mut self.model {
+            PhiVariant::Phi2(model) => model.clear_kv_cache(),
+            PhiVariant::Phi3(model) => model.clear_kv_cache(),
+        }
+    }
+
+    /// Load a Phi model from HuggingFace
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        let api = Api::new()
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
+
+        let repo = api.model(model_id.to_string());
+
+        // Download configuration
+        let config_filename = repo.get("config.json").await
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
+        let config_str = std::fs::read_to_string(config_filename)?;
+
+        // Download tokenizer
+        let tokenizer_filename = repo.get("tokenizer.json").await
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+        let tokenizer = Tokenizer::from_file(tokenizer_filename)
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
+
+        // Determine EOS token
+        let vocab = tokenizer.get_vocab(true);
+        let eos_token_id = vocab.get("<|endoftext|>")
+            .or_else(|| vocab.get("<|end|>"))
+            .or_else(|| vocab.get("</s>"))
+            .copied()
+            .unwrap_or(50256); // Default GPT-2 style EOS token
+
+        // Determine model variant based on model_id or config
+        let is_phi3 = model_id.contains("phi-3") || model_id.contains("Phi-3");
+
+        // Download model weights (handle both single and sharded files)
+        let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
+            vec![single_file]
+        } else {
+            // Try to find sharded model files
+            let mut sharded_files = Vec::new();
+            let mut index = 1;
+            loop {
+                // Try common shard counts
+                let mut found = false;
+                for total in [2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30] {
+                    let filename = format!("model-{:05}-of-{:05}.safetensors", index, total);
+                    if let Ok(file) = repo.get(&filename).await {
+                        sharded_files.push(file);
+                        found = true;
+                        break;
+                    }
+                }
+                if !found {
+                    break;
+                }
+                index += 1;
+            }
+
+            if sharded_files.is_empty() {
+                return Err(candle_core::Error::Msg(
+                    "Could not find model weights. Tried: model.safetensors, model-*-of-*.safetensors".to_string()
+                ));
+            }
+            sharded_files
+        };
+
+        let model = if is_phi3 {
+            // Load Phi3 model
+            let config: Phi3Config = serde_json::from_str(&config_str)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to parse Phi3 config: {}", e)))?;
+
+            let vb = unsafe {
+                candle_nn::VarBuilder::from_mmaped_safetensors(&weights_filenames, DType::F32, &device)?
+            };
+
+            let model = Phi3Model::new(&config, vb)?;
+            PhiVariant::Phi3(model)
+        } else {
+            // Load Phi2 model
+            let config: Config = serde_json::from_str(&config_str)
+                .map_err(|e| candle_core::Error::Msg(format!("Failed to parse Phi config: {}", e)))?;
+
+            let vb = unsafe {
+                candle_nn::VarBuilder::from_mmaped_safetensors(&weights_filenames, DType::F32, &device)?
+            };
+
+            let model = PhiModel::new(&config, vb)?;
+            PhiVariant::Phi2(model)
+        };
+
+        Ok(Self {
+            model,
+            tokenizer: TokenizerWrapper::new(tokenizer),
+            device,
+            model_id: model_id.to_string(),
+            eos_token_id,
+        })
+    }
+
+    /// Apply Phi chat template to messages
+    pub fn apply_chat_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+
+        // Phi-3 uses a specific format
+        if matches!(self.model, PhiVariant::Phi3(_)) {
+            for message in messages {
+                let role = message["role"].as_str().unwrap_or("");
+                let content = message["content"].as_str().unwrap_or("");
+
+                match role {
+                    "system" => {
+                        prompt.push_str(&format!("<|system|>\n{}<|end|>\n", content));
+                    }
+                    "user" => {
+                        prompt.push_str(&format!("<|user|>\n{}<|end|>\n", content));
+                    }
+                    "assistant" => {
+                        prompt.push_str(&format!("<|assistant|>\n{}<|end|>\n", content));
+                    }
+                    _ => {}
+                }
+            }
+            prompt.push_str("<|assistant|>\n");
+        } else {
+            // Phi-2 uses a simpler format
+            for message in messages {
+                let role = message["role"].as_str().unwrap_or("");
+                let content = message["content"].as_str().unwrap_or("");
+
+                match role {
+                    "system" => prompt.push_str(&format!("System: {}\n", content)),
+                    "user" => prompt.push_str(&format!("User: {}\n", content)),
+                    "assistant" => prompt.push_str(&format!("Assistant: {}\n", content)),
+                    _ => {}
+                }
+            }
+            prompt.push_str("Assistant: ");
+        }
+
+        Ok(prompt)
+    }
+
+    fn generate_tokens(
+        &mut self,
+        prompt_tokens: Vec<u32>,
+        config: &GenerationConfig,
+        mut callback: Option<impl FnMut(&str)>,
+    ) -> CandleResult<Vec<u32>> {
+        let mut text_gen = TextGeneration::from_config(config);
+        text_gen.set_eos_token_id(self.eos_token_id);
+        text_gen.set_tokens(prompt_tokens.clone());
+
+        let mut all_tokens = prompt_tokens.clone();
+        let start_gen = all_tokens.len();
+
+        for index in 0..config.max_length {
+            let context_size = if index > 0 { 1 } else { all_tokens.len() };
+            let start_pos = all_tokens.len().saturating_sub(context_size);
+            let ctxt = &all_tokens[start_pos..];
+
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let logits = match &mut self.model {
+                PhiVariant::Phi2(model) => model.forward(&input)?,
+                PhiVariant::Phi3(model) => model.forward(&input, start_pos)?,
+            };
+            let logits = logits.squeeze(0)?;
+
+            // Handle different output shapes
+            let logits = if logits.dims().len() == 2 {
+                let seq_len = logits.dim(0)?;
+                logits.narrow(0, seq_len - 1, 1)?.squeeze(0)?
+            } else {
+                logits
+            };
+
+            let logits = logits.to_dtype(DType::F32)?;
+
+            let next_token = text_gen.sample_next_token(
+                &logits,
+                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
+            )?;
+
+            all_tokens.push(next_token);
+
+            // Stream callback
+            if let Some(ref mut cb) = callback {
+                if config.debug_tokens {
+                    let token_piece = self.tokenizer.token_to_piece(next_token)?;
+                    cb(&format!("[{}:{}]", next_token, token_piece));
+                } else {
+                    let decoded_text = self.tokenizer.decode_incremental(&all_tokens, all_tokens.len() - 1)?;
+                    cb(&decoded_text);
+                }
+            }
+
+            // Check stop conditions
+            if text_gen.should_stop(next_token, config.max_length) {
+                break;
+            }
+
+            // Check stop sequences
+            let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
+            if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
+                break;
+            }
+        }
+
+        Ok(if config.include_prompt {
+            all_tokens
+        } else {
+            all_tokens[start_gen..].to_vec()
+        })
+    }
+}
+
+impl TextGenerator for Phi {
+    fn generate(
+        &mut self,
+        prompt: &str,
+        config: &GenerationConfig,
+    ) -> CandleResult<String> {
+        let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+        let output_tokens = self.generate_tokens(prompt_tokens, config, None::<fn(&str)>)?;
+
+        if config.debug_tokens {
+            self.tokenizer.format_tokens_with_debug(&output_tokens)
+        } else {
+            self.tokenizer.decode(&output_tokens, true)
+        }
+    }
+
+    fn generate_stream(
+        &mut self,
+        prompt: &str,
+        config: &GenerationConfig,
+        mut callback: impl FnMut(&str),
+    ) -> CandleResult<String> {
+        let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+        let output_tokens = self.generate_tokens(prompt_tokens, config, Some(&mut callback))?;
+        self.tokenizer.decode(&output_tokens, true)
+    }
+
+    fn model_name(&self) -> &str {
+        &self.model_id
+    }
+
+    fn device(&self) -> &Device {
+        &self.device
+    }
+
+    fn clear_cache(&mut self) {
+        self.clear_kv_cache();
+    }
+}
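The new wrapper follows the same `TextGenerator` contract as the existing backends. A minimal, hedged usage sketch from inside the extension crate (the model id and the surrounding async setup are illustrative, not part of the diff):

```rust
use candle_core::Device;
use serde_json::json;

use crate::llm::phi::Phi;
use crate::llm::{GenerationConfig, TextGenerator};

async fn demo() -> candle_core::Result<()> {
    // "microsoft/phi-2" is an illustrative repo id; any Phi-2/Phi-3 repo with
    // config.json, tokenizer.json and safetensors weights should work.
    let mut model = Phi::from_pretrained("microsoft/phi-2", Device::Cpu).await?;

    let messages = vec![json!({"role": "user", "content": "Write a haiku about Ruby."})];
    let prompt = model.apply_chat_template(&messages)?;

    // Default settings; structured constraints could be attached here as well.
    let config = GenerationConfig::default();
    let text = model.generate(&prompt, &config)?;
    println!("{text}");
    Ok(())
}
```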
data/ext/candle/src/llm/quantized_gguf.rs
CHANGED
@@ -2,6 +2,9 @@ use candle_core::{DType, Device, Result as CandleResult, Tensor};
 use candle_core::quantized::gguf_file;
 use candle_transformers::models::quantized_llama::ModelWeights as QuantizedLlamaModel;
 use candle_transformers::models::quantized_gemma3::ModelWeights as QuantizedGemmaModel;
+use candle_transformers::models::quantized_qwen2::ModelWeights as QuantizedQwenModel;
+use candle_transformers::models::quantized_phi::ModelWeights as QuantizedPhiModel;
+use candle_transformers::models::quantized_phi3::ModelWeights as QuantizedPhi3Model;
 use hf_hub::api::tokio::{Api, ApiRepo};
 use tokenizers::Tokenizer;
 use std::io::Seek;
@@ -9,7 +12,6 @@ use std::io::Seek;
 use crate::llm::{GenerationConfig, TextGeneration, TextGenerator, TokenizerWrapper};
 
 /// Unified GGUF model that can load any GGUF file and detect the architecture
-#[derive(Debug)]
 pub struct QuantizedGGUF {
     model: ModelType,
     tokenizer: TokenizerWrapper,
@@ -20,10 +22,12 @@ pub struct QuantizedGGUF {
     _chat_template: Option<String>,
 }
 
-#[derive(Debug)]
 enum ModelType {
     Llama(QuantizedLlamaModel),
     Gemma(QuantizedGemmaModel),
+    Qwen(QuantizedQwenModel),
+    Phi(QuantizedPhiModel),
+    Phi3(QuantizedPhi3Model),
     // Mistral uses Llama loader due to tensor naming compatibility
 }
 
@@ -97,6 +101,34 @@ impl QuantizedGGUF {
                 let model = QuantizedLlamaModel::from_gguf(content, &mut file, &device)?;
                 ModelType::Llama(model)
             }
+            "qwen" | "qwen2" | "qwen3" => {
+                // Try different loaders based on what metadata is available
+                if content.metadata.contains_key("llama.attention.head_count") {
+                    let model = QuantizedLlamaModel::from_gguf(content, &mut file, &device)?;
+                    ModelType::Llama(model)
+                } else if content.metadata.contains_key("qwen2.attention.head_count") {
+                    let model = QuantizedQwenModel::from_gguf(content, &mut file, &device)?;
+                    ModelType::Qwen(model)
+                } else if content.metadata.contains_key("qwen3.attention.head_count") {
+                    // Qwen3 GGUF files use a different metadata format
+                    // The quantized_qwen3 module is not yet in the released version of candle-transformers
+                    return Err(candle_core::Error::Msg(format!(
+                        "Qwen3 GGUF format detected but not yet fully supported.\n\n\
+                        The file contains qwen3.* metadata keys which require candle-transformers > 0.9.1.\n\n\
+                        Current alternatives:\n\
+                        1. Use Qwen2.5 GGUF models which work well:\n\
+                           - Qwen/Qwen2.5-7B-Instruct-GGUF (recommended)\n\
+                           - Qwen/Qwen2.5-32B-Instruct-GGUF\n\
+                        2. Use non-quantized Qwen models with safetensors\n\
+                        3. Wait for candle-transformers update with quantized_qwen3 support\n\n\
+                        Note: Qwen2.5 models have similar capabilities to Qwen3."
+                    )));
+                } else {
+                    // Last resort: try llama loader anyway, as it's the most common
+                    let model = QuantizedLlamaModel::from_gguf(content, &mut file, &device)?;
+                    ModelType::Llama(model)
+                }
+            }
             "gemma" | "gemma2" | "gemma3" => {
                 // Try Gemma-specific loader first, fall back to Llama if it fails
                 match QuantizedGemmaModel::from_gguf(content, &mut file, &device) {
@@ -112,9 +144,20 @@ impl QuantizedGGUF {
                     Err(e) => return Err(e),
                 }
             }
+            "phi" | "phi2" => {
+                let model = QuantizedPhiModel::from_gguf(content, &mut file, &device)?;
+                ModelType::Phi(model)
+            }
+            "phi3" => {
+                // QuantizedPhi3Model requires an additional `approx` parameter
+                // Setting to false to avoid performance issues without flash-attn
+                let approx = false;
+                let model = QuantizedPhi3Model::from_gguf(approx, content, &mut file, &device)?;
+                ModelType::Phi3(model)
+            }
             _ => {
                 return Err(candle_core::Error::Msg(format!(
-                    "Unsupported architecture: {}. Supported: llama, mistral, gemma",
+                    "Unsupported architecture: {}. Supported: llama, mistral, gemma, qwen, qwen2, qwen3, phi, phi2, phi3",
                     architecture
                 )));
             }
@@ -149,6 +192,14 @@ impl QuantizedGGUF {
             Ok("mistral".to_string())
         } else if model_lower.contains("gemma") {
             Ok("gemma".to_string())
+        } else if model_lower.contains("qwen") {
+            Ok("qwen".to_string())
+        } else if model_lower.contains("phi-3") || model_lower.contains("phi3") {
+            Ok("phi3".to_string())
+        } else if model_lower.contains("phi-2") || model_lower.contains("phi2") {
+            Ok("phi2".to_string())
+        } else if model_lower.contains("phi") {
+            Ok("phi".to_string())
         } else {
             Err(candle_core::Error::Msg(
                 "Could not determine model architecture from metadata or name".to_string()
@@ -235,6 +286,20 @@ impl QuantizedGGUF {
                     .copied()
                     .unwrap_or(1)
             }
+            "qwen" | "qwen2" | "qwen3" => {
+                vocab.get("<|endoftext|>")
+                    .or_else(|| vocab.get("<|im_end|>"))
+                    .or_else(|| vocab.get("</s>"))
+                    .copied()
+                    .unwrap_or(151643) // Default Qwen3 EOS token
+            }
+            "phi" | "phi2" | "phi3" => {
+                vocab.get("<|endoftext|>")
+                    .or_else(|| vocab.get("<|end|>"))
+                    .or_else(|| vocab.get("</s>"))
+                    .copied()
+                    .unwrap_or(50256) // Default GPT-2 style EOS token
+            }
             _ => 2, // Default
         }
     }
@@ -256,6 +321,10 @@ impl QuantizedGGUF {
         } else if model_lower.contains("gemma") {
             // Always use Gemma template for Gemma models, regardless of loader used
            self.apply_gemma_template(messages)
+        } else if model_lower.contains("qwen") {
+            self.apply_qwen_template(messages)
+        } else if model_lower.contains("phi") {
+            self.apply_phi_template(messages)
         } else {
             match self.architecture.as_str() {
                 "llama" => {
@@ -268,6 +337,12 @@ impl QuantizedGGUF {
                 "gemma" => {
                     self.apply_gemma_template(messages)
                 }
+                "qwen" | "qwen2" | "qwen3" => {
+                    self.apply_qwen_template(messages)
+                }
+                "phi" | "phi2" | "phi3" => {
+                    self.apply_phi_template(messages)
+                }
                 _ => Ok(self.apply_generic_template(messages))
             }
         }
@@ -366,6 +441,77 @@ impl QuantizedGGUF {
         Ok(prompt)
     }
 
+    fn apply_qwen_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+
+        for message in messages {
+            let role = message["role"].as_str().unwrap_or("");
+            let content = message["content"].as_str().unwrap_or("");
+
+            match role {
+                "system" => {
+                    prompt.push_str(&format!("<|im_start|>system\n{}<|im_end|>\n", content));
+                }
+                "user" => {
+                    prompt.push_str(&format!("<|im_start|>user\n{}<|im_end|>\n", content));
+                }
+                "assistant" => {
+                    prompt.push_str(&format!("<|im_start|>assistant\n{}<|im_end|>\n", content));
+                }
+                _ => {}
+            }
+        }
+
+        // Add generation prompt
+        prompt.push_str("<|im_start|>assistant\n");
+        Ok(prompt)
+    }
+
+    fn apply_phi_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+
+        // Check if it's Phi-3 (newer format) or Phi-2/Phi (simpler format)
+        let is_phi3 = self.model_id.contains("phi-3") || self.model_id.contains("Phi-3") || self.architecture == "phi3";
+
+        if is_phi3 {
+            // Phi-3 format
+            for message in messages {
+                let role = message["role"].as_str().unwrap_or("");
+                let content = message["content"].as_str().unwrap_or("");
+
+                match role {
+                    "system" => {
+                        prompt.push_str(&format!("<|system|>\n{}<|end|>\n", content));
+                    }
+                    "user" => {
+                        prompt.push_str(&format!("<|user|>\n{}<|end|>\n", content));
+                    }
+                    "assistant" => {
+                        prompt.push_str(&format!("<|assistant|>\n{}<|end|>\n", content));
+                    }
+                    _ => {}
+                }
+            }
+            prompt.push_str("<|assistant|>\n");
+        } else {
+            // Phi-2 format
+            for message in messages {
+                let role = message["role"].as_str().unwrap_or("");
+                let content = message["content"].as_str().unwrap_or("");
+
+                match role {
+                    "system" => prompt.push_str(&format!("System: {}\n", content)),
+                    "user" => prompt.push_str(&format!("User: {}\n", content)),
+                    "assistant" => prompt.push_str(&format!("Assistant: {}\n", content)),
+                    _ => {}
+                }
+            }
+            prompt.push_str("Assistant: ");
+        }
+
+        Ok(prompt)
+    }
+
     fn apply_generic_template(&self, messages: &[serde_json::Value]) -> String {
         let mut prompt = String::new();
 
@@ -381,7 +527,9 @@ impl QuantizedGGUF {
 
     /// Clear the KV cache between generations
     pub fn clear_kv_cache(&mut self) {
-        // Quantized models
+        // Quantized models don't expose cache clearing methods
+        // Phi3 GGUF models have a known issue where the KV cache
+        // cannot be cleared, leading to errors on subsequent generations
    }
 
     fn generate_tokens(
@@ -408,6 +556,9 @@ impl QuantizedGGUF {
             let logits = match &mut self.model {
                 ModelType::Llama(model) => model.forward(&input, start_pos)?,
                 ModelType::Gemma(model) => model.forward(&input, start_pos)?,
+                ModelType::Qwen(model) => model.forward(&input, start_pos)?,
+                ModelType::Phi(model) => model.forward(&input, start_pos)?,
+                ModelType::Phi3(model) => model.forward(&input, start_pos)?,
             };
 
             let logits = logits.squeeze(0)?;
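For reference, the Qwen template added above renders conversations in ChatML form. The following standalone sketch restates that formatting outside the struct so the expected prompt shape is easy to see; it mirrors the format strings in `apply_qwen_template` and is illustrative only:

```rust
use serde_json::{json, Value};

// Standalone restatement of the ChatML formatting used by apply_qwen_template.
fn chatml(messages: &[Value]) -> String {
    let mut prompt = String::new();
    for m in messages {
        let role = m["role"].as_str().unwrap_or("");
        let content = m["content"].as_str().unwrap_or("");
        if matches!(role, "system" | "user" | "assistant") {
            prompt.push_str(&format!("<|im_start|>{}\n{}<|im_end|>\n", role, content));
        }
    }
    // Trailing generation prompt that the model completes.
    prompt.push_str("<|im_start|>assistant\n");
    prompt
}

fn main() {
    let msgs = vec![
        json!({"role": "system", "content": "You are a helpful assistant."}),
        json!({"role": "user", "content": "Hello!"}),
    ];
    print!("{}", chatml(&msgs));
    // <|im_start|>system
    // You are a helpful assistant.<|im_end|>
    // <|im_start|>user
    // Hello!<|im_end|>
    // <|im_start|>assistant
}
```

The `<|im_end|>` marker also appears in the EOS fallback chain above, alongside the Qwen default token id 151643.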