red-candle 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,123 @@
+ #[cfg(test)]
+ mod constrained_generation_tests {
+     use super::super::*;
+     use crate::structured::{VocabularyAdapter, SchemaProcessor};
+     use crate::tokenizer::{TokenizerWrapper, loader::TokenizerLoader};
+
+     #[tokio::test]
+     async fn test_constrained_vs_unconstrained_generation() {
+         // This test demonstrates the difference between constrained and unconstrained generation
+
+         // Load a tokenizer for testing
+         if let Ok(tokenizer) = TokenizerLoader::from_hf_hub("bert-base-uncased", None).await {
+             let wrapper = TokenizerWrapper::new(tokenizer);
+
+             // Create vocabulary adapter
+             let vocabulary = VocabularyAdapter::from_tokenizer(&wrapper)
+                 .expect("Should create vocabulary");
+
+             // Create schema processor
+             let processor = SchemaProcessor::new();
+
+             // Define a simple JSON schema for a yes/no response
+             let schema = r#"{
+                 "type": "object",
+                 "properties": {
+                     "answer": {
+                         "type": "string",
+                         "enum": ["yes", "no"]
+                     }
+                 },
+                 "required": ["answer"]
+             }"#;
+
+             // Process schema into Index
+             let index = processor.process_schema(schema, &vocabulary)
+                 .expect("Should process schema");
+
+             // Test configuration with constraint
+             let mut config_with_constraint = GenerationConfig::default();
+             config_with_constraint.constraint = Some(index.clone());
+             config_with_constraint.max_length = 50;
+
+             // Test configuration without constraint
+             let config_without_constraint = GenerationConfig::default();
+
+             // Create text generation instances
+             let mut gen_constrained = TextGeneration::from_config(&config_with_constraint);
+             let mut gen_unconstrained = TextGeneration::from_config(&config_without_constraint);
+
+             // Set EOS token
+             gen_constrained.set_eos_token_id(102); // BERT's [SEP] token
+             gen_unconstrained.set_eos_token_id(102);
+
+             // Constraints are set internally - we can't directly verify them
+             // but we can test their effects in actual generation
+         }
+     }
+
+     #[test]
+     fn test_constraint_configuration() {
+         // Test that we can create a TextGeneration with constraints
+         let config = GenerationConfig::default();
+         let _text_gen = TextGeneration::from_config(&config);
+
+         // Test that we can create a TextGeneration from config
+         // Constraints are private implementation details
+     }
+
+     #[test]
+     fn test_repetition_penalty() {
+         use candle_core::{Tensor, Device};
+
+         let device = Device::Cpu;
+         let vocab_size = 10;
+
+         // Create logits with some positive and negative values
+         let logits_vec: Vec<f32> = vec![1.0, -1.0, 2.0, -2.0, 0.0, 3.0, -3.0, 1.5, -1.5, 0.5];
+         let mut logits = Tensor::from_vec(logits_vec.clone(), vocab_size, &device).unwrap();
+
+         // Create text generation with some tokens
+         let mut text_gen = TextGeneration::new(42, Some(1.0), None, None, 1.0, 64);
+         text_gen.push_token(0); // Token that had logit 1.0
+         text_gen.push_token(2); // Token that had logit 2.0
+         text_gen.push_token(5); // Token that had logit 3.0
+
+         // Apply repetition penalty
+         text_gen.apply_repetition_penalty(&mut logits, 1.5, 10).unwrap();
+
+         let penalized = logits.to_vec1::<f32>().unwrap();
+
+         // Check that tokens in context were penalized
+         assert!(penalized[0] < logits_vec[0], "Positive logit should be reduced");
+         assert!(penalized[2] < logits_vec[2], "Positive logit should be reduced");
+         assert!(penalized[5] < logits_vec[5], "Positive logit should be reduced");
+
+         // Check that other tokens remain unchanged
+         assert_eq!(penalized[1], logits_vec[1], "Unsampled token should be unchanged");
+         assert_eq!(penalized[3], logits_vec[3], "Unsampled token should be unchanged");
+     }
+
+     #[test]
+     fn test_stop_conditions() {
+         let mut text_gen = TextGeneration::new(42, Some(1.0), None, None, 1.0, 64);
+         text_gen.set_eos_token_id(50256); // Common EOS token
+
+         // Test max length stop
+         for i in 0..10 {
+             text_gen.push_token(i);
+         }
+         assert!(text_gen.should_stop(100, 10), "Should stop at max length");
+         assert!(!text_gen.should_stop(100, 20), "Should not stop before max length");
+
+         // Test EOS token stop
+         assert!(text_gen.should_stop(50256, 100), "Should stop at EOS token");
+         assert!(!text_gen.should_stop(123, 100), "Should not stop at non-EOS token");
+
+         // Test stop sequences
+         let stop_seqs = vec!["STOP".to_string(), "END".to_string()];
+         assert!(text_gen.check_stop_sequences("This is the STOP", &stop_seqs), "Should detect stop sequence");
+         assert!(text_gen.check_stop_sequences("The END", &stop_seqs), "Should detect stop sequence");
+         assert!(!text_gen.check_stop_sequences("Continue", &stop_seqs), "Should not detect stop sequence");
+     }
+ }
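
Note on test_repetition_penalty: the assertions above only check the direction of the change. For reference, a common formulation of repetition penalty (an illustrative assumption here, not necessarily red-candle's exact implementation) divides positive logits and multiplies negative ones by the penalty, which is consistent with those assertions:

    // Common repetition-penalty formulation (illustrative assumption, not taken
    // from red-candle's source): tokens already in the context are made less
    // likely by shrinking positive logits and pushing negative logits lower.
    fn penalize(logits: &mut [f32], context: &[u32], penalty: f32) {
        for &token in context {
            let l = &mut logits[token as usize];
            *l = if *l >= 0.0 { *l / penalty } else { *l * penalty };
        }
    }

Under that formulation, a penalty of 1.5 maps the logit 3.0 at index 5 to 2.0, which satisfies the penalized[5] < logits_vec[5] assertion above.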
@@ -1,4 +1,6 @@
  use std::time::{SystemTime, UNIX_EPOCH};
+ use std::sync::Arc;
+ use crate::structured::Index;

  /// Configuration for text generation
  #[derive(Debug, Clone)]
@@ -23,6 +25,8 @@ pub struct GenerationConfig {
      pub include_prompt: bool,
      /// Whether to show raw tokens during generation (for debugging)
      pub debug_tokens: bool,
+     /// Optional constraint index for structured generation
+     pub constraint: Option<Arc<Index>>,
  }

  /// Generate a random seed based on current time
@@ -46,6 +50,7 @@ impl Default for GenerationConfig {
              stop_sequences: vec![],
              include_prompt: false,
              debug_tokens: false,
+             constraint: None,
          }
      }
  }
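
The new constraint field is an Option<Arc<Index>>, so a schema compiled once can be shared across configurations without re-processing. A minimal sketch of wiring it up, assuming the VocabularyAdapter / SchemaProcessor API exercised in the test file above (tokenizer_wrapper and schema_json are placeholders):

    // Sketch only: tokenizer_wrapper is an already-built TokenizerWrapper and
    // schema_json is a JSON Schema string like the yes/no schema in the test above.
    let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer_wrapper)
        .expect("Should create vocabulary");
    let processor = SchemaProcessor::new();
    let index = processor.process_schema(schema_json, &vocabulary)
        .expect("Should process schema");

    let mut config = GenerationConfig::default();
    config.constraint = Some(index); // Arc<Index>: cloning it for other configs is cheap
    let _text_gen = TextGeneration::from_config(&config);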
@@ -3,6 +3,8 @@ use candle_core::{Device, Result as CandleResult};
  pub mod mistral;
  pub mod llama;
  pub mod gemma;
+ pub mod qwen;
+ pub mod phi;
  pub mod generation_config;
  pub mod text_generation;
  pub mod quantized_gguf;
@@ -12,6 +14,9 @@ pub use text_generation::TextGeneration;
  pub use quantized_gguf::QuantizedGGUF;
  pub use crate::tokenizer::TokenizerWrapper;

+ #[cfg(test)]
+ mod constrained_generation_test;
+

  /// Trait for text generation models
  pub trait TextGenerator: Send + Sync {
@@ -0,0 +1,285 @@
+ use candle_core::{DType, Device, Result as CandleResult, Tensor};
+ use candle_transformers::models::phi::{Config, Model as PhiModel};
+ use candle_transformers::models::phi3::{Config as Phi3Config, Model as Phi3Model};
+ use hf_hub::api::tokio::Api;
+ use tokenizers::Tokenizer;
+
+ use crate::llm::{GenerationConfig, TextGeneration, TextGenerator, TokenizerWrapper};
+
+ /// Phi model wrapper for text generation
+ pub struct Phi {
+     model: PhiVariant,
+     tokenizer: TokenizerWrapper,
+     device: Device,
+     model_id: String,
+     eos_token_id: u32,
+ }
+
+ enum PhiVariant {
+     Phi2(PhiModel),
+     Phi3(Phi3Model),
+ }
+
+ impl Phi {
+     /// Get the tokenizer
+     pub fn tokenizer(&self) -> &TokenizerWrapper {
+         &self.tokenizer
+     }
+
+     /// Clear the KV cache between generations
+     pub fn clear_kv_cache(&mut self) {
+         match &mut self.model {
+             PhiVariant::Phi2(model) => model.clear_kv_cache(),
+             PhiVariant::Phi3(model) => model.clear_kv_cache(),
+         }
+     }
+
+     /// Load a Phi model from HuggingFace
+     pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+         let api = Api::new()
+             .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
+
+         let repo = api.model(model_id.to_string());
+
+         // Download configuration
+         let config_filename = repo.get("config.json").await
+             .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
+         let config_str = std::fs::read_to_string(config_filename)?;
+
+         // Download tokenizer
+         let tokenizer_filename = repo.get("tokenizer.json").await
+             .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+         let tokenizer = Tokenizer::from_file(tokenizer_filename)
+             .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
+
+         // Determine EOS token
+         let vocab = tokenizer.get_vocab(true);
+         let eos_token_id = vocab.get("<|endoftext|>")
+             .or_else(|| vocab.get("<|end|>"))
+             .or_else(|| vocab.get("</s>"))
+             .copied()
+             .unwrap_or(50256); // Default GPT-2 style EOS token
+
+         // Determine model variant based on model_id or config
+         let is_phi3 = model_id.contains("phi-3") || model_id.contains("Phi-3");
+
+         // Download model weights (handle both single and sharded files)
+         let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
+             vec![single_file]
+         } else {
+             // Try to find sharded model files
+             let mut sharded_files = Vec::new();
+             let mut index = 1;
+             loop {
+                 // Try common shard counts
+                 let mut found = false;
+                 for total in [2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30] {
+                     let filename = format!("model-{:05}-of-{:05}.safetensors", index, total);
+                     if let Ok(file) = repo.get(&filename).await {
+                         sharded_files.push(file);
+                         found = true;
+                         break;
+                     }
+                 }
+                 if !found {
+                     break;
+                 }
+                 index += 1;
+             }
+
+             if sharded_files.is_empty() {
+                 return Err(candle_core::Error::Msg(
+                     "Could not find model weights. Tried: model.safetensors, model-*-of-*.safetensors".to_string()
+                 ));
+             }
+             sharded_files
+         };
+
+         let model = if is_phi3 {
+             // Load Phi3 model
+             let config: Phi3Config = serde_json::from_str(&config_str)
+                 .map_err(|e| candle_core::Error::Msg(format!("Failed to parse Phi3 config: {}", e)))?;
+
+             let vb = unsafe {
+                 candle_nn::VarBuilder::from_mmaped_safetensors(&weights_filenames, DType::F32, &device)?
+             };
+
+             let model = Phi3Model::new(&config, vb)?;
+             PhiVariant::Phi3(model)
+         } else {
+             // Load Phi2 model
+             let config: Config = serde_json::from_str(&config_str)
+                 .map_err(|e| candle_core::Error::Msg(format!("Failed to parse Phi config: {}", e)))?;
+
+             let vb = unsafe {
+                 candle_nn::VarBuilder::from_mmaped_safetensors(&weights_filenames, DType::F32, &device)?
+             };
+
+             let model = PhiModel::new(&config, vb)?;
+             PhiVariant::Phi2(model)
+         };
+
+         Ok(Self {
+             model,
+             tokenizer: TokenizerWrapper::new(tokenizer),
+             device,
+             model_id: model_id.to_string(),
+             eos_token_id,
+         })
+     }
+
+     /// Apply Phi chat template to messages
+     pub fn apply_chat_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+         let mut prompt = String::new();
+
+         // Phi-3 uses a specific format
+         if matches!(self.model, PhiVariant::Phi3(_)) {
+             for message in messages {
+                 let role = message["role"].as_str().unwrap_or("");
+                 let content = message["content"].as_str().unwrap_or("");
+
+                 match role {
+                     "system" => {
+                         prompt.push_str(&format!("<|system|>\n{}<|end|>\n", content));
+                     }
+                     "user" => {
+                         prompt.push_str(&format!("<|user|>\n{}<|end|>\n", content));
+                     }
+                     "assistant" => {
+                         prompt.push_str(&format!("<|assistant|>\n{}<|end|>\n", content));
+                     }
+                     _ => {}
+                 }
+             }
+             prompt.push_str("<|assistant|>\n");
+         } else {
+             // Phi-2 uses a simpler format
+             for message in messages {
+                 let role = message["role"].as_str().unwrap_or("");
+                 let content = message["content"].as_str().unwrap_or("");
+
+                 match role {
+                     "system" => prompt.push_str(&format!("System: {}\n", content)),
+                     "user" => prompt.push_str(&format!("User: {}\n", content)),
+                     "assistant" => prompt.push_str(&format!("Assistant: {}\n", content)),
+                     _ => {}
+                 }
+             }
+             prompt.push_str("Assistant: ");
+         }
+
+         Ok(prompt)
+     }
+
+     fn generate_tokens(
+         &mut self,
+         prompt_tokens: Vec<u32>,
+         config: &GenerationConfig,
+         mut callback: Option<impl FnMut(&str)>,
+     ) -> CandleResult<Vec<u32>> {
+         let mut text_gen = TextGeneration::from_config(config);
+         text_gen.set_eos_token_id(self.eos_token_id);
+         text_gen.set_tokens(prompt_tokens.clone());
+
+         let mut all_tokens = prompt_tokens.clone();
+         let start_gen = all_tokens.len();
+
+         for index in 0..config.max_length {
+             let context_size = if index > 0 { 1 } else { all_tokens.len() };
+             let start_pos = all_tokens.len().saturating_sub(context_size);
+             let ctxt = &all_tokens[start_pos..];
+
+             let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+             let logits = match &mut self.model {
+                 PhiVariant::Phi2(model) => model.forward(&input)?,
+                 PhiVariant::Phi3(model) => model.forward(&input, start_pos)?,
+             };
+             let logits = logits.squeeze(0)?;
+
+             // Handle different output shapes
+             let logits = if logits.dims().len() == 2 {
+                 let seq_len = logits.dim(0)?;
+                 logits.narrow(0, seq_len - 1, 1)?.squeeze(0)?
+             } else {
+                 logits
+             };
+
+             let logits = logits.to_dtype(DType::F32)?;
+
+             let next_token = text_gen.sample_next_token(
+                 &logits,
+                 Some((config.repetition_penalty, config.repetition_penalty_last_n)),
+             )?;
+
+             all_tokens.push(next_token);
+
+             // Stream callback
+             if let Some(ref mut cb) = callback {
+                 if config.debug_tokens {
+                     let token_piece = self.tokenizer.token_to_piece(next_token)?;
+                     cb(&format!("[{}:{}]", next_token, token_piece));
+                 } else {
+                     let decoded_text = self.tokenizer.decode_incremental(&all_tokens, all_tokens.len() - 1)?;
+                     cb(&decoded_text);
+                 }
+             }
+
+             // Check stop conditions
+             if text_gen.should_stop(next_token, config.max_length) {
+                 break;
+             }
+
+             // Check stop sequences
+             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
+             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
+                 break;
+             }
+         }
+
+         Ok(if config.include_prompt {
+             all_tokens
+         } else {
+             all_tokens[start_gen..].to_vec()
+         })
+     }
+ }
+
+ impl TextGenerator for Phi {
+     fn generate(
+         &mut self,
+         prompt: &str,
+         config: &GenerationConfig,
+     ) -> CandleResult<String> {
+         let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+         let output_tokens = self.generate_tokens(prompt_tokens, config, None::<fn(&str)>)?;
+
+         if config.debug_tokens {
+             self.tokenizer.format_tokens_with_debug(&output_tokens)
+         } else {
+             self.tokenizer.decode(&output_tokens, true)
+         }
+     }
+
+     fn generate_stream(
+         &mut self,
+         prompt: &str,
+         config: &GenerationConfig,
+         mut callback: impl FnMut(&str),
+     ) -> CandleResult<String> {
+         let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+         let output_tokens = self.generate_tokens(prompt_tokens, config, Some(&mut callback))?;
+         self.tokenizer.decode(&output_tokens, true)
+     }
+
+     fn model_name(&self) -> &str {
+         &self.model_id
+     }
+
+     fn device(&self) -> &Device {
+         &self.device
+     }
+
+     fn clear_cache(&mut self) {
+         self.clear_kv_cache();
+     }
+ }
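
To show how the pieces of the new Phi wrapper fit together, here is a hedged usage sketch written as if from inside the crate (the model id, prompt, and async runtime setup are illustrative assumptions, not something this diff prescribes):

    // Sketch only: model id, prompt content and runtime setup are assumptions.
    use candle_core::Device;
    use serde_json::json;
    use crate::llm::phi::Phi;
    use crate::llm::{GenerationConfig, TextGenerator};

    async fn demo() -> candle_core::Result<()> {
        // from_pretrained downloads config.json, tokenizer.json and the
        // safetensors weights from the Hugging Face Hub.
        let mut phi = Phi::from_pretrained("microsoft/phi-2", Device::Cpu).await?;

        // Model ids without "phi-3"/"Phi-3" get the "System:/User:/Assistant:" format;
        // Phi-3 ids get the <|system|>/<|user|>/<|end|> format instead.
        let messages = vec![
            json!({"role": "system", "content": "You are a helpful assistant."}),
            json!({"role": "user", "content": "Name three prime numbers."}),
        ];
        let prompt = phi.apply_chat_template(&messages)?;

        // Generation goes through the TextGenerator trait defined in mod.rs.
        let config = GenerationConfig::default();
        let answer = phi.generate(&prompt, &config)?;
        println!("{answer}");
        Ok(())
    }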